From e24fd2e34182626b2f82b5a67c99187b3bae5747 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 3 Apr 2023 11:08:52 -0700 Subject: [PATCH 01/75] Initial commit --- .../raft/core/detail/device_type_gpu.hpp | 50 ++ cpp/include/raft/core/device_mdbuffer.hpp | 316 +++++++++++++ cpp/include/raft/core/mdbuffer.hpp | 436 ++++++++++++++++++ 3 files changed, 802 insertions(+) create mode 100644 cpp/include/raft/core/detail/device_type_gpu.hpp create mode 100644 cpp/include/raft/core/device_mdbuffer.hpp create mode 100644 cpp/include/raft/core/mdbuffer.hpp diff --git a/cpp/include/raft/core/detail/device_type_gpu.hpp b/cpp/include/raft/core/detail/device_type_gpu.hpp new file mode 100644 index 0000000000..a04dc3cda0 --- /dev/null +++ b/cpp/include/raft/core/detail/device_type_gpu.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include + +namespace raft { +namespace detail { +template <> +struct device_id { + using value_type = typename rmm::cuda_device_id::value_type; + device_id() noexcept(false) + : id_{[]() { + auto raw_id = value_type{}; + RAFT_CUDA_TRY(cudaGetDevice(&raw_id)); + return raw_id; + }()} {}; + /* We do not mark this constructor as explicit to allow public API + * functions to accept `device_id` arguments without requiring + * downstream consumers to explicitly construct a device_id. 
Thus, + * consumers can use the type they expect to use when specifying a device + * (int), but once we are inside the public API, the device type remains + * attached to this value and we can easily convert to the strongly-typed + * rmm::cuda_device_id if desired. + */ + device_id(value_type dev_id) noexcept : id_{dev_id} {}; + + auto value() const noexcept { return id_.value(); } + auto rmm_id() const noexcept { return id_; } + + private: + rmm::cuda_device_id id_; +}; +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/device_mdbuffer.hpp b/cpp/include/raft/core/device_mdbuffer.hpp new file mode 100644 index 0000000000..f72ae36d64 --- /dev/null +++ b/cpp/include/raft/core/device_mdbuffer.hpp @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { + +template +using device_accessor = host_device_accessor; + +template +using managed_accessor = host_device_accessor; + +/** + * @brief std::experimental::mdspan with device tag to avoid accessing incorrect memory location. 
+ */ +template > +using device_mdspan = mdspan>; + +template > +using managed_mdspan = mdspan>; + +template +struct is_device_mdspan : std::false_type { +}; +template +struct is_device_mdspan : std::bool_constant { +}; + +/** + * @\brief Boolean to determine if template type T is either raft::device_mdspan or a derived type + */ +template +using is_device_mdspan_t = is_device_mdspan>; + +template +using is_input_device_mdspan_t = is_device_mdspan>; + +template +using is_output_device_mdspan_t = is_device_mdspan>; + +template +struct is_managed_mdspan : std::false_type { +}; +template +struct is_managed_mdspan : std::bool_constant { +}; + +/** + * @\brief Boolean to determine if template type T is either raft::managed_mdspan or a derived type + */ +template +using is_managed_mdspan_t = is_managed_mdspan>; + +template +using is_input_managed_mdspan_t = is_managed_mdspan>; + +template +using is_output_managed_mdspan_t = is_managed_mdspan>; + +/** + * @\brief Boolean to determine if variadic template types Tn are either raft::device_mdspan or a + * derived type + */ +template +inline constexpr bool is_device_mdspan_v = std::conjunction_v...>; + +template +inline constexpr bool is_input_device_mdspan_v = + std::conjunction_v...>; + +template +inline constexpr bool is_output_device_mdspan_v = + std::conjunction_v...>; + +template +using enable_if_device_mdspan = std::enable_if_t>; + +template +using enable_if_input_device_mdspan = std::enable_if_t>; + +template +using enable_if_output_device_mdspan = std::enable_if_t>; + +/** + * @\brief Boolean to determine if variadic template types Tn are either raft::managed_mdspan or a + * derived type + */ +template +inline constexpr bool is_managed_mdspan_v = std::conjunction_v...>; + +template +inline constexpr bool is_input_managed_mdspan_v = + std::conjunction_v...>; + +template +inline constexpr bool is_output_managed_mdspan_v = + std::conjunction_v...>; + +template +using enable_if_managed_mdspan = std::enable_if_t>; + 
+template +using enable_if_input_managed_mdspan = std::enable_if_t>; + +template +using enable_if_output_managed_mdspan = std::enable_if_t>; + +/** + * @brief Shorthand for 0-dim host mdspan (scalar). + * @tparam ElementType the data type of the scalar element + * @tparam IndexType the index type of the extents + */ +template +using device_scalar_view = device_mdspan>; + +/** + * @brief Shorthand for 1-dim device mdspan. + * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using device_vector_view = device_mdspan, LayoutPolicy>; + +/** + * @brief Shorthand for c-contiguous device matrix view. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using device_matrix_view = device_mdspan, LayoutPolicy>; + +/** + * @brief Shorthand for 128 byte aligned device matrix view. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy must be of type layout_{left/right}_padded + */ +template , + typename = enable_if_layout_padded> +using device_aligned_matrix_view = + device_mdspan, + LayoutPolicy, + std::experimental::aligned_accessor>; + +/** + * @brief Create a 2-dim 128 byte aligned mdspan instance for device pointer. It's + * expected that the given layout policy match the layout of the underlying + * pointer. 
+ * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy must be of type layout_{left/right}_padded + * @tparam IndexType the index type of the extents + * @param[in] ptr on device to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + */ +template > +auto make_device_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) +{ + using data_handle_type = + typename std::experimental::aligned_accessor::data_handle_type; + static_assert(std::is_same>::value || + std::is_same>::value); + assert(reinterpret_cast(ptr) == + std::experimental::details::alignTo(reinterpret_cast(ptr), + detail::alignment::value)); + + data_handle_type aligned_pointer = ptr; + + matrix_extent extents{n_rows, n_cols}; + return device_aligned_matrix_view{aligned_pointer, extents}; +} + +/** + * @brief Create a raft::managed_mdspan + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param ptr Pointer to the data + * @param exts dimensionality of the array (series of integers) + * @return raft::managed_mdspan + */ +template +auto make_managed_mdspan(ElementType* ptr, extents exts) +{ + return make_mdspan(ptr, exts); +} + +/** + * @brief Create a 0-dim (scalar) mdspan instance for device value. + * + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @param[in] ptr on device to wrap + */ +template +auto make_device_scalar_view(ElementType* ptr) +{ + scalar_extent extents; + return device_scalar_view{ptr, extents}; +} + +/** + * @brief Create a 2-dim c-contiguous mdspan instance for device pointer. It's + * expected that the given layout policy match the layout of the underlying + * pointer. 
+ * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * @tparam IndexType the index type of the extents + * @param[in] ptr on device to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + */ +template +auto make_device_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) +{ + matrix_extent extents{n_rows, n_cols}; + return device_matrix_view{ptr, extents}; +} + +/** + * @brief Create a 1-dim mdspan instance for device pointer. + * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] ptr on device to wrap + * @param[in] n number of elements in pointer + * @return raft::device_vector_view + */ +template +auto make_device_vector_view(ElementType* ptr, IndexType n) +{ + return device_vector_view{ptr, n}; +} + +/** + * @brief Create a 1-dim mdspan instance for device pointer. 
+ * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] ptr on device to wrap + * @param[in] mapping The layout mapping to use for this vector + * @return raft::device_vector_view + */ +template +auto make_device_vector_view( + ElementType* ptr, + const typename LayoutPolicy::template mapping>& mapping) +{ + return device_vector_view{ptr, mapping}; +} + +/** + * @brief Construct a strided vector layout mapping + * + * Usage example: + * @code{.cpp} + * #include + * + * int n_elements = 10; + * int stride = 10; + * auto vector = raft::make_device_vector_view(vector_ptr, + * raft::make_vector_strided_layout(n_elements, stride)); + * @endcode + * + * @tparam IndexType the index type of the extents + * @param[in] n the number of elements in the vector + * @param[in] stride the stride between elements in the vector + */ +template +auto make_vector_strided_layout(IndexType n, IndexType stride) +{ + return make_strided_layout(vector_extent{n}, std::array{stride}); +} +} // end namespace raft diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp new file mode 100644 index 0000000000..6588dc41d1 --- /dev/null +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace raft { + +template > +using mdspan = std::experimental::mdspan; + +namespace detail { + +template +struct device_id { + using value_type = int; + + device_id(value_type device_index) {} + auto value() const { return value_type{}; } +}; + +template <> +struct device_id { + using value_type = int; + device_id() : id_{value_type{}} {}; + device_id(value_type dev_id) : id_{dev_id} {}; + + auto value() const noexcept { return id_; } + private: + value_type id_; +}; + +template<> +struct device_id { + using value_type = typename rmm::cuda_device_id::value_type; + device_id() noexcept(false) : id_{[](){ + auto raw_id = value_type{}; + RAFT_CUDA_CHECK(cudaGetDevice(&raw_id)); + return raw_id; + }()} {}; + device_id(value_type dev_id) noexcept : id_{dev_id} {}; + + auto value() const noexcept { return id_.value(); } + private: + rmm::cuda_device_id id_; +}; + +template +class non_owning_buffer { + using value_type = std::remove_const_t; + non_owning_buffer() : data_{nullptr} { } + + explicit non_owning_buffer(T* ptr) : data_{ptr} { } + + T* get() const { return data_; } + + private: + // TODO(wphicks): Back this with RMM-allocated host memory + T* data_; +}; + +template +class owning_buffer { + owning_buffer() {} + owning_buffer(device_id device_id, std::size_t size, cuda_stream stream) {} + auto* get() const { return static_cast(nullptr); } +}; + +template +class owning_buffer { + // TODO(wphicks): Assess need for buffers of const T + using value_type = std::remove_const_t; + owning_buffer() : data_{} {} + + owning_buffer(device_id device_id, std::size_t size, cudaStream_t stream) noexcept(false) + : data_{[&device_id, &size, &stream]() { + auto device_context = device_setter{device_id}; + return rmm::device_buffer{size * sizeof(value_type), rmm::cuda_stream_view{stream}}; + }()} + { + } + + auto* get() const { return reinterpret_cast(data_.data()); } + + 
private: + mutable rmm::device_buffer data_; +}; + +template +class buffer { + buffer() { + } + + buffer(device_, data_, size_, cached_ptr) : + device_(device_), data_(data_), size_(size_), cached_ptr(cached_ptr) { + } + + buffer(device_) + private: + device_id_variant device_; + data_store data_; + size_t size_; + T* cached_ptr; +}; + +// alignment fixed to 128 bytes +struct alignment { + static constexpr size_t value = 128; +}; + +} // namespace detail + +template +using layout_right_padded = std::experimental::layout_right_padded< + detail::padding>>::value>; + +template +using layout_left_padded = std::experimental::layout_left_padded< + detail::padding>>::value>; + +template +using enable_if_layout_padded = + std::enable_if_t>::value || + std::is_same>::value>; + +/** + * Ensure all types listed in the parameter pack `Extents` are integral types. + * Usage: + * put it as the last nameless template parameter of a function: + * `typename = ensure_integral_extents` + */ +template +using ensure_integral_extents = std::enable_if_t...>>; + +/** + * @\brief Template checks and helpers to determine if type T is an std::mdspan + * or a derived type + */ + +template +void __takes_an_mdspan_ptr(mdspan*); + +template +struct is_mdspan : std::false_type { +}; +template +struct is_mdspan()))>> + : std::true_type { +}; + +template +struct is_input_mdspan : std::false_type { +}; +template +struct is_input_mdspan()))>> + : std::bool_constant> { +}; + +template +struct is_output_mdspan : std::false_type { +}; +template +struct is_output_mdspan()))>> + : std::bool_constant> { +}; + +template +using is_mdspan_t = is_mdspan>; + +template +using is_input_mdspan_t = is_input_mdspan; + +template +using is_output_mdspan_t = is_output_mdspan; + +/** + * @\brief Boolean to determine if variadic template types Tn are either + * raft::host_mdspan/raft::device_mdspan or their derived types + */ +template +inline constexpr bool is_mdspan_v = std::conjunction_v...>; + +template +using 
enable_if_mdspan = std::enable_if_t>; + +template +inline constexpr bool is_input_mdspan_v = std::conjunction_v...>; + +template +using enable_if_input_mdspan = std::enable_if_t>; + +template +inline constexpr bool is_output_mdspan_v = std::conjunction_v...>; + +template +using enable_if_output_mdspan = std::enable_if_t>; + +// uint division optimization inspired by the CIndexer in cupy. Division operation is +// slow on both CPU and GPU, especially 64 bit integer. So here we first try to avoid 64 +// bit when the index is smaller, then try to avoid division when it's exp of 2. +template +RAFT_INLINE_FUNCTION auto unravel_index_impl( + I idx, std::experimental::extents shape) +{ + constexpr auto kRank = static_cast(shape.rank()); + std::size_t index[shape.rank()]{0}; // NOLINT + static_assert(std::is_signed::value, + "Don't change the type without changing the for loop."); + for (int32_t dim = kRank; --dim > 0;) { + auto s = static_cast>>(shape.extent(dim)); + if (s & (s - 1)) { + auto t = idx / s; + index[dim] = idx - t * s; + idx = t; + } else { // exp of 2 + index[dim] = idx & (s - 1); + idx >>= detail::popc(s - 1); + } + } + index[0] = idx; + return detail::arr_to_tup(index); +} + +/** + * @brief Create a raft::mdspan + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @tparam is_host_accessible whether the data is accessible on host + * @tparam is_device_accessible whether the data is accessible on device + * @param ptr Pointer to the data + * @param exts dimensionality of the array (series of integers) + * @return raft::mdspan + */ +template +constexpr auto make_mdspan(ElementType* ptr, extents exts) +{ + using accessor_type = host_device_accessor< + std::experimental::default_accessor, + detail::memory_type_from_access()>; + /*using accessor_type = host_device_accessor, + mem_type>; */ + + return mdspan{ptr, exts}; +} + +/** + * 
@brief Create a layout_stride mapping from extents and strides + * @param[in] extents the dimensionality of the layout + * @param[in] strides the strides between elements in the layout + * @return raft::layout_stride::mapping + */ +template +auto make_strided_layout(Extents extents, Strides strides) +{ + return layout_stride::mapping{extents, strides}; +} + +/** + * @brief Create raft::extents to specify dimensionality + * + * @tparam IndexType The type of each dimension of the extents + * @tparam Extents Dimensions (a series of integers) + * @param exts The desired dimensions + * @return raft::extents + */ +template > +constexpr auto make_extents(Extents... exts) +{ + return extents{exts...}; +} + +/** + * @brief Flatten raft::mdspan into a 1-dim array view + * + * @tparam mdspan_type Expected type raft::host_mdspan or raft::device_mdspan + * @param mds raft::host_mdspan or raft::device_mdspan object + * @return raft::host_mdspan or raft::device_mdspan with vector_extent + * depending on AccessoryPolicy + */ +template > +auto flatten(mdspan_type mds) +{ + RAFT_EXPECTS(mds.is_exhaustive(), "Input must be contiguous."); + + vector_extent ext{mds.size()}; + + return std::experimental::mdspan(mds.data_handle(), ext); +} + +/** + * @brief Reshape raft::host_mdspan or raft::device_mdspan + * + * @tparam mdspan_type Expected type raft::host_mdspan or raft::device_mdspan + * @tparam IndexType the index type of the extents + * @tparam Extents raft::extents for dimensions + * @param mds raft::host_mdspan or raft::device_mdspan object + * @param new_shape Desired new shape of the input + * @return raft::host_mdspan or raft::device_mdspan, depending on AccessorPolicy + */ +template > +auto reshape(mdspan_type mds, extents new_shape) +{ + RAFT_EXPECTS(mds.is_exhaustive(), "Input must be contiguous."); + + size_t new_size = 1; + for (size_t i = 0; i < new_shape.rank(); ++i) { + new_size *= new_shape.extent(i); + } + RAFT_EXPECTS(new_size == mds.size(), "Cannot reshape array 
with size mismatch"); + + return std::experimental::mdspan(mds.data_handle(), + new_shape); +} + +/** + * \brief Turns linear index into coordinate. Similar to numpy unravel_index. + * + * \code + * auto m = make_host_matrix(7, 6); + * auto m_v = m.view(); + * auto coord = unravel_index(2, m.extents(), typename decltype(m)::layout_type{}); + * std::apply(m_v, coord) = 2; + * \endcode + * + * \param idx The linear index. + * \param shape The shape of the array to use. + * \param layout Must be `layout_c_contiguous` (row-major) in current implementation. + * + * \return A std::tuple that represents the coordinate. + */ +template +RAFT_INLINE_FUNCTION auto unravel_index(Idx idx, + extents shape, + LayoutPolicy const& layout) +{ + static_assert(std::is_same_v>, + layout_c_contiguous>, + "Only C layout is supported."); + static_assert(std::is_integral_v, "Index must be integral."); + auto constexpr kIs64 = sizeof(std::remove_cv_t>) == sizeof(uint64_t); + if (kIs64 && static_cast(idx) > std::numeric_limits::max()) { + return unravel_index_impl(static_cast(idx), shape); + } else { + return unravel_index_impl(static_cast(idx), shape); + } +} + +/** + * @brief Const accessor specialization for default_accessor + * + * @tparam ElementType + * @param a + * @return std::experimental::default_accessor> + */ +template +std::experimental::default_accessor> accessor_of_const( + std::experimental::default_accessor a) +{ + return {a}; +} + +/** + * @brief Const accessor specialization for host_device_accessor + * + * @tparam ElementType the data type of the mdspan elements + * @tparam MemType the type of memory where the elements are stored. 
+ * @param a host_device_accessor + * @return host_device_accessor>, + * MemType> + */ +template +host_device_accessor>, MemType> +accessor_of_const(host_device_accessor, MemType> a) +{ + return {a}; +} + +/** + * @brief Create a copy of the given mdspan with const element type + * + * @tparam ElementType the const-qualified data type of the mdspan elements + * @tparam Extents raft::extents for dimensions + * @tparam Layout policy for strides and layout ordering + * @tparam Accessor Accessor policy for the input and output + * @param mds raft::mdspan object + * @return raft::mdspan + */ +template +auto make_const_mdspan(mdspan mds) +{ + auto acc_c = accessor_of_const(mds.accessor()); + return mdspan, Extents, Layout, decltype(acc_c)>{ + mds.data_handle(), mds.mapping(), acc_c}; +} + +} // namespace raft From 07dabfe8e2da416ae37c3d02fd5e2d8a2d91b8f8 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 6 Apr 2023 10:14:58 -0700 Subject: [PATCH 02/75] New commit --- cpp/include/raft/core/buffer_copy.hpp | 70 ++ .../core/detail/buffer_utils/copy_cpu.hpp | 37 + .../core/detail/buffer_utils/copy_gpu.hpp | 36 + .../detail/buffer_utils/non_owning_buffer.hpp | 36 + .../detail/buffer_utils/owning_buffer.hpp | 28 + .../buffer_utils/owning_buffer_base.hpp | 33 + .../detail/buffer_utils/owning_buffer_cpu.hpp | 46 ++ .../detail/buffer_utils/owning_buffer_gpu.hpp | 45 ++ .../raft/core/detail/const_agnostic.hpp | 27 + .../raft/core/detail/device_setter_base.hpp | 30 + .../raft/core/detail/device_setter_gpu.hpp | 46 ++ .../core/detail/execution_device_id_base.hpp | 29 + .../core/detail/execution_device_id_cpu.hpp | 33 + ...pe_gpu.hpp => execution_device_id_gpu.hpp} | 2 +- cpp/include/raft/core/device_setter.hpp | 27 + cpp/include/raft/core/device_support.hpp | 32 + cpp/include/raft/core/device_type.hpp | 22 + cpp/include/raft/core/exceptions.hpp | 71 ++ cpp/include/raft/core/execution_device_id.hpp | 31 + cpp/include/raft/core/execution_stream.hpp | 32 + 
cpp/include/raft/core/mdbuffer.hpp | 701 ++++++++---------- 21 files changed, 1030 insertions(+), 384 deletions(-) create mode 100644 cpp/include/raft/core/buffer_copy.hpp create mode 100644 cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp create mode 100644 cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp create mode 100644 cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp create mode 100644 cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp create mode 100644 cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp create mode 100644 cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp create mode 100644 cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp create mode 100644 cpp/include/raft/core/detail/const_agnostic.hpp create mode 100644 cpp/include/raft/core/detail/device_setter_base.hpp create mode 100644 cpp/include/raft/core/detail/device_setter_gpu.hpp create mode 100644 cpp/include/raft/core/detail/execution_device_id_base.hpp create mode 100644 cpp/include/raft/core/detail/execution_device_id_cpu.hpp rename cpp/include/raft/core/detail/{device_type_gpu.hpp => execution_device_id_gpu.hpp} (97%) create mode 100644 cpp/include/raft/core/device_setter.hpp create mode 100644 cpp/include/raft/core/device_support.hpp create mode 100644 cpp/include/raft/core/device_type.hpp create mode 100644 cpp/include/raft/core/exceptions.hpp create mode 100644 cpp/include/raft/core/execution_device_id.hpp create mode 100644 cpp/include/raft/core/execution_stream.hpp diff --git a/cpp/include/raft/core/buffer_copy.hpp b/cpp/include/raft/core/buffer_copy.hpp new file mode 100644 index 0000000000..1595219a7f --- /dev/null +++ b/cpp/include/raft/core/buffer_copy.hpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#ifdef CUML_ENABLE_GPU +#include +#endif +#include + +namespace raft { + +template +void copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset) { + buffer::detail::copy(dst + dst_offset, src + src_offset, size, execution_stream{}); +} + +template +void copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset, execution_stream stream) { + buffer::detail::copy(dst + dst_offset, src + src_offset, size, stream); +} + +template +void copy(T* dst, T const* src, uint32_t size) { + buffer::detail::copy(dst, src, size, execution_stream{}); +} + +template +void copy(T* dst, T const* src, uint32_t size, execution_stream stream) { + buffer::detail::copy(dst, src, size, stream); +} + +template +void copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type, uint32_t dst_offset, uint32_t src_offset, execution_stream stream) { + if (dst_type == device_type::gpu && src_type == device_type::gpu) { + buffer::detail::copy(dst + dst_offset, src + src_offset, size, stream); + } else if (dst_type == device_type::cpu && src_type == device_type::cpu) { + buffer::detail::copy(dst + dst_offset, src + src_offset, size, stream); + } else if (dst_type == device_type::gpu && src_type == device_type::cpu) { + buffer::detail::copy(dst + dst_offset, src + src_offset, size, stream); + } else if (dst_type == device_type::cpu && src_type == device_type::gpu) { + buffer::detail::copy(dst + dst_offset, src + src_offset, size, stream); + } +} + +template 
+void copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type) { + copy(dst, src, size, dst_type, src_type, 0, 0, execution_stream{}); +} + +template +void copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type, execution_stream stream) { + copy(dst, src, size, dst_type, src_type, 0, 0, stream); +} + +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp new file mode 100644 index 0000000000..295909d37b --- /dev/null +++ b/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#include +#include +#include +#include + +namespace raft { +namespace detail { + +template +std::enable_if_t, std::bool_constant>, void> copy(T* dst, T const* src, uint32_t size, execution_stream stream) { + std::copy(src, src + size, dst); +} + +template +std::enable_if_t, std::bool_constant>, std::bool_constant>, void> copy(T* dst, T const* src, uint32_t size, execution_stream stream) { + throw raft::cuda_unsupported("Copying from or to device in non-GPU build"); +} + +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp new file mode 100644 index 0000000000..25f692517d --- /dev/null +++ b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include "raft/util/cuda_rt_essentials.hpp" +#include "raft/util/cudart_utils.hpp" +#include +#include +#include +#include + +#include +#include + +namespace raft { +namespace detail { + +template +std::enable_if_t, std::bool_constant>, std::bool_constant>, void> copy(T* dst, T const* src, uint32_t size, raft::execution_stream stream) { + RAFT_CUDA_TRY(thrust::copy(rmm::exec_policy(stream), src, src + size, dst)); +} + +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp new file mode 100644 index 0000000000..7f2155e8a2 --- /dev/null +++ b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#include + +namespace raft { +namespace detail { +template +class non_owning_buffer { + using value_type = std::remove_const_t; + non_owning_buffer() : data_{nullptr} { } + + non_owning_buffer(T* ptr) : data_{ptr} { } + + auto* get() const { return data_; } + + private: + // TODO(wphicks): Back this with RMM-allocated host memory + T* data_; +}; +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp new file mode 100644 index 0000000000..1d44de6aad --- /dev/null +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include "owning_buffer_cpu.hpp" +#ifdef CUML_ENABLE_GPU +#include "owning_buffer_gpu.hpp" +#endif +namespace raft { +namespace detail { +template +using owning_buffer = owning_buffer; + +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp new file mode 100644 index 0000000000..4c7531dd2d --- /dev/null +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include + +namespace raft { +namespace detail { + +template +class owning_buffer { + owning_buffer() {} + owning_buffer(execution_device_id device_id, std::size_t size, execution_stream stream) {} + auto* get() const { return static_cast(nullptr); } +}; + +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp new file mode 100644 index 0000000000..a4951cd20e --- /dev/null +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#include +#include "owning_buffer_base.hpp" +#include + +namespace raft { +namespace detail { +template +class owning_buffer { + // TODO(wphicks): Assess need for buffers of const T + using value_type = std::remove_const_t; + + owning_buffer() + : data_{std::unique_ptr{nullptr}} + { + } + + owning_buffer(std::size_t size) + : data_{std::make_unique(size)} + { + } + + auto* get() const { return data_.get(); } + + private: + // TODO(wphicks): Back this with RMM-allocated host memory + std::unique_ptr data_; +}; +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp new file mode 100644 index 0000000000..1922022755 --- /dev/null +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#include +#include +#include +#include "owning_buffer_base.hpp" +#include + +namespace raft { +namespace detail { +template +class owning_buffer { + using value_type = std::remove_const_t; + owning_buffer() : data_{} {} + + owning_buffer(execution_device_id execution_device_id, std::size_t size, cudaStream_t stream) noexcept(false) + : data_{[&execution_device_id, &size, &stream]() { + auto device_context = device_setter{execution_device_id}; + return rmm::device_buffer{size * sizeof(value_type), rmm::cuda_stream_view{stream}}; + }()} + { + } + + auto* get() const { return reinterpret_cast(data_.data()); } + + private: + mutable rmm::device_buffer data_; +}; +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/const_agnostic.hpp b/cpp/include/raft/core/detail/const_agnostic.hpp new file mode 100644 index 0000000000..e0e20db3dc --- /dev/null +++ b/cpp/include/raft/core/detail/const_agnostic.hpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include + +namespace raft::detail { +template +using const_agnostic_same_t = + std::enable_if_t, std::remove_const_t>, V>; + +template +inline constexpr auto const_agnostic_same_v = + std::is_same_v, std::remove_const_t>; +} diff --git a/cpp/include/raft/core/detail/device_setter_base.hpp b/cpp/include/raft/core/detail/device_setter_base.hpp new file mode 100644 index 0000000000..cebc3a5b4d --- /dev/null +++ b/cpp/include/raft/core/detail/device_setter_base.hpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include + +namespace raft { +namespace detail { + +/** Struct for setting current device within a code block */ +template +class device_setter { + device_setter(execution_device_id device) {} +}; + +} +} \ No newline at end of file diff --git a/cpp/include/raft/core/detail/device_setter_gpu.hpp b/cpp/include/raft/core/detail/device_setter_gpu.hpp new file mode 100644 index 0000000000..300fcf766b --- /dev/null +++ b/cpp/include/raft/core/detail/device_setter_gpu.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +// #include +#include +#include +#include +#include + +namespace raft { +namespace detail { + +/** Class for setting current device within a code block */ +template <> +class device_setter { + device_setter(raft::execution_device_id device) noexcept(false) : prev_device_{[]() { + auto result = int{}; + raft::cuda_check(cudaGetDevice(&result)); + return result; + }()} { + raft::cuda_check(cudaSetDevice(device.value())); + } + + ~device_setter() { + RAFT_CUDA_TRY_NO_THROW(cudaSetDevice(prev_device_.value())); + } + private: + device_id prev_device_; +}; + +} +} diff --git a/cpp/include/raft/core/detail/execution_device_id_base.hpp b/cpp/include/raft/core/detail/execution_device_id_base.hpp new file mode 100644 index 0000000000..6af2106771 --- /dev/null +++ b/cpp/include/raft/core/detail/execution_device_id_base.hpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include + +namespace raft { +namespace detail { +template +class device_id { + using value_type = int; + + device_id(value_type device_index) {} + auto value() const { return value_type{}; } +}; +} +} diff --git a/cpp/include/raft/core/detail/execution_device_id_cpu.hpp b/cpp/include/raft/core/detail/execution_device_id_cpu.hpp new file mode 100644 index 0000000000..0892982eff --- /dev/null +++ b/cpp/include/raft/core/detail/execution_device_id_cpu.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#include + +namespace raft { +namespace detail { +template <> +class device_id { + using value_type = int; + device_id() : id_{value_type{}} {}; + device_id(value_type dev_id) : id_{dev_id} {}; + + auto value() const noexcept { return id_; } + private: + value_type id_; +}; +} +} \ No newline at end of file diff --git a/cpp/include/raft/core/detail/device_type_gpu.hpp b/cpp/include/raft/core/detail/execution_device_id_gpu.hpp similarity index 97% rename from cpp/include/raft/core/detail/device_type_gpu.hpp rename to cpp/include/raft/core/detail/execution_device_id_gpu.hpp index a04dc3cda0..27015bc92f 100644 --- a/cpp/include/raft/core/detail/device_type_gpu.hpp +++ b/cpp/include/raft/core/detail/execution_device_id_gpu.hpp @@ -22,7 +22,7 @@ namespace raft { namespace detail { template <> -struct device_id { +class device_id { using value_type = typename rmm::cuda_device_id::value_type; device_id() noexcept(false) : id_{[]() { diff --git a/cpp/include/raft/core/device_setter.hpp b/cpp/include/raft/core/device_setter.hpp new file mode 100644 index 0000000000..4f915ae59c --- /dev/null +++ b/cpp/include/raft/core/device_setter.hpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#ifdef CUML_ENABLE_GPU +#include +#endif +#include + +namespace raft { + +using device_setter = detail::device_setter; + +} \ No newline at end of file diff --git a/cpp/include/raft/core/device_support.hpp b/cpp/include/raft/core/device_support.hpp new file mode 100644 index 0000000000..f7ab1d7e5a --- /dev/null +++ b/cpp/include/raft/core/device_support.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include + +namespace raft { +#ifdef RAFT_DISABLE_CUDA +auto constexpr static const CUDA_ENABLED = false; +#else +auto constexpr static const CUDA_ENABLED = true; +#endif + +struct cuda_unsupported : raft::exception { + explicit cuda_unsupported(std::string const& msg) : raft::exception{msg} {} + cuda_unsupported() : cuda_unsupported{"CUDA functionality invoked in non-CUDA build"} {} +}; + +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/device_type.hpp b/cpp/include/raft/core/device_type.hpp new file mode 100644 index 0000000000..94a8f88dc1 --- /dev/null +++ b/cpp/include/raft/core/device_type.hpp @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +namespace raft { +enum class device_type { + cpu, + gpu +}; +} \ No newline at end of file diff --git a/cpp/include/raft/core/exceptions.hpp b/cpp/include/raft/core/exceptions.hpp new file mode 100644 index 0000000000..3fe18a2d73 --- /dev/null +++ b/cpp/include/raft/core/exceptions.hpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include + +namespace raft { +struct bad_cuda_call : std::exception { + bad_cuda_call() : bad_cuda_call("CUDA API call failed") {} + bad_cuda_call(char const* msg) : msg_{msg} {} + virtual char const* what() const noexcept { return msg_; } + + private: + char const* msg_; +}; + +struct out_of_bounds : std::exception { + out_of_bounds() : out_of_bounds("Attempted out-of-bounds memory access") {} + out_of_bounds(char const* msg) : msg_{msg} {} + virtual char const* what() const noexcept { return msg_; } + + private: + char const* msg_; +}; + +struct wrong_device_type : std::exception { + wrong_device_type() : wrong_device_type( + "Attempted to use host data on GPU or device data on CPU" + ) {} + wrong_device_type(char const* msg) : msg_{msg} {} + virtual char const* what() const noexcept { return msg_; } + + private: + char const* msg_; +}; + +struct mem_type_mismatch : std::exception { + mem_type_mismatch() : mem_type_mismatch( + "Memory type does not match expected type" + ) {} + mem_type_mismatch(char const* msg) : msg_{msg} {} + virtual char const* what() const noexcept { return msg_; } + + private: + char const* msg_; +}; + +struct wrong_device : std::exception { + wrong_device() : wrong_device( + "Attempted to use incorrect device" + ) {} + wrong_device(char const* msg) : msg_{msg} {} + virtual char const* what() const noexcept { return msg_; } + + private: + char const* msg_; +}; + +} \ No newline at end of file diff --git a/cpp/include/raft/core/execution_device_id.hpp b/cpp/include/raft/core/execution_device_id.hpp new file mode 100644 index 0000000000..36b63f17db --- /dev/null +++ b/cpp/include/raft/core/execution_device_id.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#ifdef CUML_ENABLE_GPU +#include +#endif +#include +#include + +namespace raft { +template +using execution_device_id = detail::device_id; + +using execution_device_id_variant = std::variant, execution_device_id>; +} diff --git a/cpp/include/raft/core/execution_stream.hpp b/cpp/include/raft/core/execution_stream.hpp new file mode 100644 index 0000000000..e2ce14fbb2 --- /dev/null +++ b/cpp/include/raft/core/execution_stream.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#ifdef CUML_ENABLE_GPU +#include +#endif + +namespace raft { +#ifdef CUML_ENABLE_GPU +using execution_stream = cudaStream_t; +#else +using execution_stream = int; +#endif +inline void synchronize(execution_stream stream) { +#ifdef CUML_ENABLE_GPU + cudaStreamSynchronize(stream); +#endif +} +} \ No newline at end of file diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp index 6588dc41d1..362cbc7f79 100644 --- a/cpp/include/raft/core/mdbuffer.hpp +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,423 +14,358 @@ * limitations under the License. */ #pragma once - -#include -#include -#include - -#include -#include -#include -#include - -#include +#include "raft/core/memory_type.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace raft { +/** + * @brief A container which may or may not own its own data on host or device + * + */ +using index_type = std::size_t; +template +class buffer { + using value_type = T; -template > -using mdspan = std::experimental::mdspan; - -namespace detail { - -template -struct device_id { - using value_type = int; - - device_id(value_type device_index) {} - auto value() const { return value_type{}; } -}; - -template <> -struct device_id { - using value_type = int; - device_id() : id_{value_type{}} {}; - device_id(value_type dev_id) : id_{dev_id} {}; - - auto value() const noexcept { return id_; } - private: - value_type id_; -}; - -template<> -struct device_id { - using value_type = typename rmm::cuda_device_id::value_type; - device_id() noexcept(false) : id_{[](){ - auto raw_id = value_type{}; - RAFT_CUDA_CHECK(cudaGetDevice(&raw_id)); - 
return raw_id; - }()} {}; - device_id(value_type dev_id) noexcept : id_{dev_id} {}; - - auto value() const noexcept { return id_.value(); } - private: - rmm::cuda_device_id id_; -}; - -template -class non_owning_buffer { - using value_type = std::remove_const_t; - non_owning_buffer() : data_{nullptr} { } - - explicit non_owning_buffer(T* ptr) : data_{ptr} { } - - T* get() const { return data_; } + // using data_store = std::variant< + // non_owning_buffer, non_owning_buffer, owning_buffer, owning_buffer + // >; - private: - // TODO(wphicks): Back this with RMM-allocated host memory - T* data_; -}; + buffer() : buffer_type{}, size_{} {} -template -class owning_buffer { - owning_buffer() {} - owning_buffer(device_id device_id, std::size_t size, cuda_stream stream) {} - auto* get() const { return static_cast(nullptr); } + private: + execution_device_id_variant buffer_type; + index_type size_; + T* cached_ptr; }; -template -class owning_buffer { - // TODO(wphicks): Assess need for buffers of const T - using value_type = std::remove_const_t; - owning_buffer() : data_{} {} - - owning_buffer(device_id device_id, std::size_t size, cudaStream_t stream) noexcept(false) - : data_{[&device_id, &size, &stream]() { - auto device_context = device_setter{device_id}; - return rmm::device_buffer{size * sizeof(value_type), rmm::cuda_stream_view{stream}}; + /** Construct non-initialized owning buffer */ +template +class buffer{ + buffer(index_type size, + device_type mem_type = device_type::cpu, + int device = 0, + execution_stream stream = 0) + : device_{[mem_type, &device]() { + auto result = {}; + switch (mem_type) { + case device_type::cpu: result = execution_device_id{device}; break; + case device_type::gpu: result = execution_device_id{device}; break; + } + return result; + }()}, + data_{[this, mem_type, size, stream]() { + auto result = data_store{}; + switch (mem_type) { + case device_type::cpu: + result = owning_buffer{size}; + break; + case device_type::gpu: + result = 
owning_buffer{std::get<1>(device_), size, stream}; + break; + } + return result; + }()}, + size_{size}, + cached_ptr {[this](){ + auto result = static_cast(nullptr); + switch(data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; + } + return result; }()} { } +} - auto* get() const { return reinterpret_cast(data_.data()); } - - private: - mutable rmm::device_buffer data_; -}; - -template -class buffer { - buffer() { + /** Construct non-owning buffer */ + buffer(T* input_data, + index_type size, + device_type mem_type = device_type::cpu, + int device = 0) + : device_{[mem_type, &device]() { + auto result = device_id_variant{}; + switch (mem_type) { + case device_type::cpu: + result = device_id{device}; + break; + case device_type::gpu: + result = device_id{device}; + break; + } + return result; + }()}, + data_{[this, input_data, mem_type]() { + auto result = data_store{}; + switch (mem_type) { + case device_type::cpu: + result = non_owning_buffer{input_data}; + break; + case device_type::gpu: + result = non_owning_buffer{input_data}; + break; + } + return result; + }()}, + size_{size}, + cached_ptr {[this](){ + auto result = static_cast(nullptr); + switch(data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; + } + return result; + }()} + { } - buffer(device_, data_, size_, cached_ptr) : - device_(device_), data_(data_), size_(size_), cached_ptr(cached_ptr) { + /** + * @brief Construct one buffer from another in the given memory location + * (either on host or on device) + * A buffer constructed in this way is owning and will copy the data from + * the original location + */ + buffer(buffer const& other, device_type mem_type, int 
device = 0, cuda_stream stream=cuda_stream{}) + : device_{[mem_type, &device]() { + auto result = device_id_variant{}; + switch (mem_type) { + case device_type::cpu: + result = device_id{device}; + break; + case device_type::gpu: + result = device_id{device}; + break; + } + return result; + }()}, + data_{[this, &other, mem_type, device, stream]() { + auto result = data_store{}; + auto result_data = static_cast(nullptr); + if (mem_type == device_type::cpu) { + auto buf = owning_buffer(other.size()); + result_data = buf.get(); + result = std::move(buf); + } else if (mem_type==device_type::gpu) { + auto buf = owning_buffer(std::get<1>(device_), other.size(), stream); + result_data = buf.get(); + result = std::move(buf); + } + copy(result_data, other.data(), other.size(), mem_type, other.memory_type(), stream); + return result; + }()}, + size_{other.size()}, + cached_ptr {[this](){ + auto result = static_cast(nullptr); + switch(data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; + } + return result; + }()} + { } - buffer(device_) - private: - device_id_variant device_; - data_store data_; - size_t size_; - T* cached_ptr; -}; - -// alignment fixed to 128 bytes -struct alignment { - static constexpr size_t value = 128; -}; - -} // namespace detail - -template -using layout_right_padded = std::experimental::layout_right_padded< - detail::padding>>::value>; - -template -using layout_left_padded = std::experimental::layout_left_padded< - detail::padding>>::value>; - -template -using enable_if_layout_padded = - std::enable_if_t>::value || - std::is_same>::value>; + /** + * @brief Create owning copy of existing buffer + * The memory type of this new buffer will be the same as the original + */ + buffer(buffer const& other) : buffer(other, other.memory_type(), other.device_index()) {} + friend void 
swap(buffer& first, buffer& second) { + using std::swap; + swap(first.device_, second.device_); + swap(first.data_, second.data_); + swap(first.size_, second.size_); + swap(first.cached_ptr, second.cached_ptr); + } + buffer& operator=(buffer other) { + swap(*this, other); + return *this; + } -/** - * Ensure all types listed in the parameter pack `Extents` are integral types. - * Usage: - * put it as the last nameless template parameter of a function: - * `typename = ensure_integral_extents` - */ -template -using ensure_integral_extents = std::enable_if_t...>>; + /** + * @brief Create owning copy of existing buffer with given stream + * The memory type of this new buffer will be the same as the original + */ + buffer(buffer const& other, cuda_stream stream) : buffer(other, other.memory_type(), other.device_index(), stream) {} + + /** + * @brief Move from existing buffer unless a copy is necessary based on + * memory location + */ + buffer(buffer&& other, device_type mem_type, int device, cuda_stream stream) + : device_{[mem_type, &device]() { + auto result = device_id_variant{}; + switch (mem_type) { + case device_type::cpu: + result = device_id{device}; + break; + case device_type::gpu: + result = device_id{device}; + break; + } + return result; + }()}, + data_{[&other, mem_type, device, stream]() { + auto result = data_store{}; + if (mem_type == other.memory_type() && device == other.device_index()) { + result = std::move(other.data_); + } else { + auto* result_data = static_cast(nullptr); + if (mem_type == device_type::cpu) { + auto buf = owning_buffer{other.size()}; + result_data = buf.get(); + result = std::move(buf); + } else if (mem_type == device_type::gpu) { + auto buf = owning_buffer{device, other.size(), stream}; + result_data = buf.get(); + result = std::move(buf); + } + copy(result_data, other.data(), other.size(), mem_type, other.memory_type(), stream); + } + return result; + }()}, + size_{other.size()}, + cached_ptr {[this](){ + auto result = 
static_cast(nullptr); + switch(data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; + } + return result; + }()} + { + } + buffer(buffer&& other, device_type mem_type, int device) + : buffer{std::move(other), mem_type, device, cuda_stream{}} + { + } + buffer(buffer&& other, device_type mem_type) + : buffer{std::move(other), mem_type, 0, cuda_stream{}} + { + } -/** - * @\brief Template checks and helpers to determine if type T is an std::mdspan - * or a derived type - */ + buffer(buffer&& other) : buffer{} { + swap(*this, other); + } -template -void __takes_an_mdspan_ptr(mdspan*); + template < + typename iter_t, + typename = decltype(*std::declval(), void(), ++std::declval(), void()) + > + buffer(iter_t const& begin, iter_t const& end) + : buffer{static_cast(std::distance(begin, end))} + { + auto index = std::size_t{}; + std::for_each(begin, end, [&index, this](auto&& val) { + data()[index++] = val; + }); + } -template -struct is_mdspan : std::false_type { -}; -template -struct is_mdspan()))>> - : std::true_type { -}; + template < + typename iter_t, + typename = decltype(*std::declval(), void(), ++std::declval(), void()) + > + buffer(iter_t const& begin, iter_t const& end, device_type mem_type) : buffer{buffer{begin, end}, mem_type} { } + + template < + typename iter_t, + typename = decltype(*std::declval(), void(), ++std::declval(), void()) + > + buffer(iter_t const& begin, iter_t const& end, device_type mem_type, int device, cuda_stream stream=cuda_stream{}) : buffer{buffer{begin, end}, mem_type, device, stream} { } + + auto size() const noexcept { return size_; } + HOST DEVICE auto* data() const noexcept { + return cached_ptr; + } + auto memory_type() const noexcept { + auto result = device_type{}; + if (device_.index() == 0) { + result = device_type::cpu; + } else { + result = device_type::gpu; 
+ } + return result; + } -template -struct is_input_mdspan : std::false_type { -}; -template -struct is_input_mdspan()))>> - : std::bool_constant> { -}; + auto device() const noexcept { + return device_; + } -template -struct is_output_mdspan : std::false_type { -}; -template -struct is_output_mdspan()))>> - : std::bool_constant> { + auto device_index() const noexcept { + auto result = int{}; + switch(device_.index()) { + case 0: result = std::get<0>(device_).value(); break; + case 1: result = std::get<1>(device_).value(); break; + } + return result; + } + ~buffer() = default; }; -template -using is_mdspan_t = is_mdspan>; - -template -using is_input_mdspan_t = is_input_mdspan; - -template -using is_output_mdspan_t = is_output_mdspan; - -/** - * @\brief Boolean to determine if variadic template types Tn are either - * raft::host_mdspan/raft::device_mdspan or their derived types - */ -template -inline constexpr bool is_mdspan_v = std::conjunction_v...>; - -template -using enable_if_mdspan = std::enable_if_t>; - -template -inline constexpr bool is_input_mdspan_v = std::conjunction_v...>; - -template -using enable_if_input_mdspan = std::enable_if_t>; - -template -inline constexpr bool is_output_mdspan_v = std::conjunction_v...>; - -template -using enable_if_output_mdspan = std::enable_if_t>; - -// uint division optimization inspired by the CIndexer in cupy. Division operation is -// slow on both CPU and GPU, especially 64 bit integer. So here we first try to avoid 64 -// bit when the index is smaller, then try to avoid division when it's exp of 2. 
-template -RAFT_INLINE_FUNCTION auto unravel_index_impl( - I idx, std::experimental::extents shape) -{ - constexpr auto kRank = static_cast(shape.rank()); - std::size_t index[shape.rank()]{0}; // NOLINT - static_assert(std::is_signed::value, - "Don't change the type without changing the for loop."); - for (int32_t dim = kRank; --dim > 0;) { - auto s = static_cast>>(shape.extent(dim)); - if (s & (s - 1)) { - auto t = idx / s; - index[dim] = idx - t * s; - idx = t; - } else { // exp of 2 - index[dim] = idx & (s - 1); - idx >>= detail::popc(s - 1); +template +const_agnostic_same_t copy(buffer& dst, buffer const& src, typename buffer::index_type dst_offset, typename buffer::index_type src_offset, typename buffer::index_type size, cuda_stream stream) { + if constexpr (bounds_check) { + if (src.size() - src_offset < size || dst.size() - dst_offset < size) { + throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); } } - index[0] = idx; - return detail::arr_to_tup(index); + copy(dst.data() + dst_offset, src.data() + src_offset, size, dst.memory_type(), src.memory_type(), stream); } -/** - * @brief Create a raft::mdspan - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @tparam is_host_accessible whether the data is accessible on host - * @tparam is_device_accessible whether the data is accessible on device - * @param ptr Pointer to the data - * @param exts dimensionality of the array (series of integers) - * @return raft::mdspan - */ -template -constexpr auto make_mdspan(ElementType* ptr, extents exts) -{ - using accessor_type = host_device_accessor< - std::experimental::default_accessor, - detail::memory_type_from_access()>; - /*using accessor_type = host_device_accessor, - mem_type>; */ - - return mdspan{ptr, exts}; +template +const_agnostic_same_t copy(buffer& dst, buffer const& src, cuda_stream stream) { + copy(dst, src, 
0, 0, src.size(), stream); } - -/** - * @brief Create a layout_stride mapping from extents and strides - * @param[in] extents the dimensionality of the layout - * @param[in] strides the strides between elements in the layout - * @return raft::layout_stride::mapping - */ -template -auto make_strided_layout(Extents extents, Strides strides) -{ - return layout_stride::mapping{extents, strides}; +template +const_agnostic_same_t copy(buffer& dst, buffer const& src) { + copy(dst, src, 0, 0, src.size(), cuda_stream{}); } -/** - * @brief Create raft::extents to specify dimensionality - * - * @tparam IndexType The type of each dimension of the extents - * @tparam Extents Dimensions (a series of integers) - * @param exts The desired dimensions - * @return raft::extents - */ -template > -constexpr auto make_extents(Extents... exts) -{ - return extents{exts...}; -} - -/** - * @brief Flatten raft::mdspan into a 1-dim array view - * - * @tparam mdspan_type Expected type raft::host_mdspan or raft::device_mdspan - * @param mds raft::host_mdspan or raft::device_mdspan object - * @return raft::host_mdspan or raft::device_mdspan with vector_extent - * depending on AccessoryPolicy - */ -template > -auto flatten(mdspan_type mds) -{ - RAFT_EXPECTS(mds.is_exhaustive(), "Input must be contiguous."); - - vector_extent ext{mds.size()}; - - return std::experimental::mdspan(mds.data_handle(), ext); -} - -/** - * @brief Reshape raft::host_mdspan or raft::device_mdspan - * - * @tparam mdspan_type Expected type raft::host_mdspan or raft::device_mdspan - * @tparam IndexType the index type of the extents - * @tparam Extents raft::extents for dimensions - * @param mds raft::host_mdspan or raft::device_mdspan object - * @param new_shape Desired new shape of the input - * @return raft::host_mdspan or raft::device_mdspan, depending on AccessorPolicy - */ -template > -auto reshape(mdspan_type mds, extents new_shape) -{ - RAFT_EXPECTS(mds.is_exhaustive(), "Input must be contiguous."); - - size_t 
new_size = 1; - for (size_t i = 0; i < new_shape.rank(); ++i) { - new_size *= new_shape.extent(i); - } - RAFT_EXPECTS(new_size == mds.size(), "Cannot reshape array with size mismatch"); - - return std::experimental::mdspan(mds.data_handle(), - new_shape); -} - -/** - * \brief Turns linear index into coordinate. Similar to numpy unravel_index. - * - * \code - * auto m = make_host_matrix(7, 6); - * auto m_v = m.view(); - * auto coord = unravel_index(2, m.extents(), typename decltype(m)::layout_type{}); - * std::apply(m_v, coord) = 2; - * \endcode - * - * \param idx The linear index. - * \param shape The shape of the array to use. - * \param layout Must be `layout_c_contiguous` (row-major) in current implementation. - * - * \return A std::tuple that represents the coordinate. - */ -template -RAFT_INLINE_FUNCTION auto unravel_index(Idx idx, - extents shape, - LayoutPolicy const& layout) -{ - static_assert(std::is_same_v>, - layout_c_contiguous>, - "Only C layout is supported."); - static_assert(std::is_integral_v, "Index must be integral."); - auto constexpr kIs64 = sizeof(std::remove_cv_t>) == sizeof(uint64_t); - if (kIs64 && static_cast(idx) > std::numeric_limits::max()) { - return unravel_index_impl(static_cast(idx), shape); - } else { - return unravel_index_impl(static_cast(idx), shape); +template +const_agnostic_same_t copy(buffer&& dst, buffer&& src, typename buffer::index_type dst_offset, typename buffer::index_type src_offset, typename buffer::index_type size, cuda_stream stream) { + if constexpr (bounds_check) { + if (src.size() - src_offset < size || dst.size() - dst_offset < size) { + throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); + } } + copy(dst.data() + dst_offset, src.data() + src_offset, size, dst.memory_type(), src.memory_type(), stream); } -/** - * @brief Const accessor specialization for default_accessor - * - * @tparam ElementType - * @param a - * @return std::experimental::default_accessor> - */ -template 
-std::experimental::default_accessor> accessor_of_const( - std::experimental::default_accessor a) -{ - return {a}; +template +const_agnostic_same_t copy(buffer&& dst, buffer&& src, typename buffer::index_type dst_offset, cuda_stream stream) { + copy(dst, src, dst_offset, 0, src.size(), stream); } -/** - * @brief Const accessor specialization for host_device_accessor - * - * @tparam ElementType the data type of the mdspan elements - * @tparam MemType the type of memory where the elements are stored. - * @param a host_device_accessor - * @return host_device_accessor>, - * MemType> - */ -template -host_device_accessor>, MemType> -accessor_of_const(host_device_accessor, MemType> a) -{ - return {a}; +template +const_agnostic_same_t copy(buffer&& dst, buffer&& src, cuda_stream stream) { + copy(dst, src, 0, 0, src.size(), stream); } - -/** - * @brief Create a copy of the given mdspan with const element type - * - * @tparam ElementType the const-qualified data type of the mdspan elements - * @tparam Extents raft::extents for dimensions - * @tparam Layout policy for strides and layout ordering - * @tparam Accessor Accessor policy for the input and output - * @param mds raft::mdspan object - * @return raft::mdspan - */ -template -auto make_const_mdspan(mdspan mds) -{ - auto acc_c = accessor_of_const(mds.accessor()); - return mdspan, Extents, Layout, decltype(acc_c)>{ - mds.data_handle(), mds.mapping(), acc_c}; +template +const_agnostic_same_t copy(buffer&& dst, buffer&& src) { + copy(dst, src, 0, 0, src.size(), cuda_stream{}); } -} // namespace raft +} // namespace raft_proto \ No newline at end of file From 21c264113ebc062eaab897e201486a8bab4ea09d Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 6 Apr 2023 14:38:22 -0700 Subject: [PATCH 03/75] Update --- cpp/include/raft/core/buffer_copy.hpp | 36 ++-- .../core/detail/buffer_utils/copy_cpu.hpp | 4 +- .../core/detail/buffer_utils/copy_gpu.hpp | 2 +- .../detail/buffer_utils/owning_buffer.hpp | 2 +- 
.../detail/buffer_utils/owning_buffer_cpu.hpp | 1 + .../raft/core/detail/device_setter_gpu.hpp | 8 +- .../core/detail/execution_device_id_base.hpp | 4 +- .../core/detail/execution_device_id_cpu.hpp | 8 +- .../core/detail/execution_device_id_gpu.hpp | 8 +- cpp/include/raft/core/device_setter.hpp | 2 +- cpp/include/raft/core/device_support.hpp | 18 ++ cpp/include/raft/core/device_type.hpp | 7 + cpp/include/raft/core/exceptions.hpp | 12 +- cpp/include/raft/core/execution_device_id.hpp | 4 +- cpp/include/raft/core/execution_stream.hpp | 6 +- cpp/include/raft/core/mdbuffer.hpp | 161 ++++++++---------- 16 files changed, 147 insertions(+), 136 deletions(-) diff --git a/cpp/include/raft/core/buffer_copy.hpp b/cpp/include/raft/core/buffer_copy.hpp index 1595219a7f..741015139f 100644 --- a/cpp/include/raft/core/buffer_copy.hpp +++ b/cpp/include/raft/core/buffer_copy.hpp @@ -17,7 +17,7 @@ #include #include #include -#ifdef CUML_ENABLE_GPU +#ifndef RAFT_DISABLE_CUDA #include #endif #include @@ -25,46 +25,46 @@ namespace raft { template -void copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset) { - buffer::detail::copy(dst + dst_offset, src + src_offset, size, execution_stream{}); +void buffer_copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset) { + detail::buffer_copy(dst + dst_offset, src + src_offset, size, execution_stream{}); } template -void copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset, execution_stream stream) { - buffer::detail::copy(dst + dst_offset, src + src_offset, size, stream); +void buffer_copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset, execution_stream stream) { + detail::buffer_copy(dst + dst_offset, src + src_offset, size, stream); } template -void copy(T* dst, T const* src, uint32_t size) { - buffer::detail::copy(dst, src, size, execution_stream{}); +void buffer_copy(T* dst, T const* src, uint32_t size) { + 
detail::buffer_copy(dst, src, size, execution_stream{}); } template -void copy(T* dst, T const* src, uint32_t size, execution_stream stream) { - buffer::detail::copy(dst, src, size, stream); +void buffer_copy(T* dst, T const* src, uint32_t size, execution_stream stream) { + detail::buffer_copy(dst, src, size, stream); } template -void copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type, uint32_t dst_offset, uint32_t src_offset, execution_stream stream) { +void buffer_copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type, uint32_t dst_offset, uint32_t src_offset, execution_stream stream) { if (dst_type == device_type::gpu && src_type == device_type::gpu) { - buffer::detail::copy(dst + dst_offset, src + src_offset, size, stream); + detail::buffer_copy(dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::cpu && src_type == device_type::cpu) { - buffer::detail::copy(dst + dst_offset, src + src_offset, size, stream); + detail::buffer_copy(dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::gpu && src_type == device_type::cpu) { - buffer::detail::copy(dst + dst_offset, src + src_offset, size, stream); + detail::buffer_copy(dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::cpu && src_type == device_type::gpu) { - buffer::detail::copy(dst + dst_offset, src + src_offset, size, stream); + detail::buffer_copy(dst + dst_offset, src + src_offset, size, stream); } } template -void copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type) { - copy(dst, src, size, dst_type, src_type, 0, 0, execution_stream{}); +void buffer_copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type) { + detail::buffer_copy(dst, src, size, dst_type, src_type, 0, 0, execution_stream{}); } template -void copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type 
src_type, execution_stream stream) { - copy(dst, src, size, dst_type, src_type, 0, 0, stream); +void buffer_copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type, execution_stream stream) { + detail::buffer_copy(dst, src, size, dst_type, src_type, 0, 0, stream); } } // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp index 295909d37b..272c589b4f 100644 --- a/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp @@ -24,12 +24,12 @@ namespace raft { namespace detail { template -std::enable_if_t, std::bool_constant>, void> copy(T* dst, T const* src, uint32_t size, execution_stream stream) { +std::enable_if_t, std::bool_constant>, void> buffer_copy(T* dst, T const* src, uint32_t size, execution_stream stream) { std::copy(src, src + size, dst); } template -std::enable_if_t, std::bool_constant>, std::bool_constant>, void> copy(T* dst, T const* src, uint32_t size, execution_stream stream) { +std::enable_if_t, std::bool_constant>, std::bool_constant>, void> buffer_copy(T* dst, T const* src, uint32_t size, execution_stream stream) { throw raft::cuda_unsupported("Copying from or to device in non-GPU build"); } diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp index 25f692517d..f12998d8c4 100644 --- a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp @@ -28,7 +28,7 @@ namespace raft { namespace detail { template -std::enable_if_t, std::bool_constant>, std::bool_constant>, void> copy(T* dst, T const* src, uint32_t size, raft::execution_stream stream) { +std::enable_if_t, std::bool_constant>, std::bool_constant>, void> buffer_copy(T* dst, T const* src, uint32_t size, raft::execution_stream stream) { 
RAFT_CUDA_TRY(thrust::copy(rmm::exec_policy(stream), src, src + size, dst)); } diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp index 1d44de6aad..f9531ab21f 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp @@ -16,7 +16,7 @@ #pragma once #include #include "owning_buffer_cpu.hpp" -#ifdef CUML_ENABLE_GPU +#ifndef RAFT_DISABLE_CUDA #include "owning_buffer_gpu.hpp" #endif namespace raft { diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp index a4951cd20e..a70ff60ce1 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp @@ -18,6 +18,7 @@ #include #include "owning_buffer_base.hpp" #include +#include namespace raft { namespace detail { diff --git a/cpp/include/raft/core/detail/device_setter_gpu.hpp b/cpp/include/raft/core/detail/device_setter_gpu.hpp index 300fcf766b..1468aaae6f 100644 --- a/cpp/include/raft/core/detail/device_setter_gpu.hpp +++ b/cpp/include/raft/core/detail/device_setter_gpu.hpp @@ -14,8 +14,8 @@ * limitations under the License. 
*/ #pragma once +#include "raft/util/cuda_rt_essentials.hpp" #include -// #include #include #include #include @@ -29,17 +29,17 @@ template <> class device_setter { device_setter(raft::execution_device_id device) noexcept(false) : prev_device_{[]() { auto result = int{}; - raft::cuda_check(cudaGetDevice(&result)); + RAFT_CUDA_TRY(cudaGetDevice(&result)); return result; }()} { - raft::cuda_check(cudaSetDevice(device.value())); + RAFT_CUDA_TRY(cudaSetDevice(device.value())); } ~device_setter() { RAFT_CUDA_TRY_NO_THROW(cudaSetDevice(prev_device_.value())); } private: - device_id prev_device_; + execution_device_id prev_device_; }; } diff --git a/cpp/include/raft/core/detail/execution_device_id_base.hpp b/cpp/include/raft/core/detail/execution_device_id_base.hpp index 6af2106771..2e9d13a6e2 100644 --- a/cpp/include/raft/core/detail/execution_device_id_base.hpp +++ b/cpp/include/raft/core/detail/execution_device_id_base.hpp @@ -19,10 +19,10 @@ namespace raft { namespace detail { template -class device_id { +struct execution_device_id { using value_type = int; - device_id(value_type device_index) {} + execution_device_id(value_type device_index) {} auto value() const { return value_type{}; } }; } diff --git a/cpp/include/raft/core/detail/execution_device_id_cpu.hpp b/cpp/include/raft/core/detail/execution_device_id_cpu.hpp index 0892982eff..d9317bc51f 100644 --- a/cpp/include/raft/core/detail/execution_device_id_cpu.hpp +++ b/cpp/include/raft/core/detail/execution_device_id_cpu.hpp @@ -14,16 +14,16 @@ * limitations under the License. 
*/ #pragma once -#include +#include "execution_device_id_base.hpp" #include namespace raft { namespace detail { template <> -class device_id { +struct execution_device_id { using value_type = int; - device_id() : id_{value_type{}} {}; - device_id(value_type dev_id) : id_{dev_id} {}; + execution_device_id() : id_{value_type{}} {}; + execution_device_id(value_type dev_id) : id_{dev_id} {}; auto value() const noexcept { return id_; } private: diff --git a/cpp/include/raft/core/detail/execution_device_id_gpu.hpp b/cpp/include/raft/core/detail/execution_device_id_gpu.hpp index 27015bc92f..771c0b0b5c 100644 --- a/cpp/include/raft/core/detail/execution_device_id_gpu.hpp +++ b/cpp/include/raft/core/detail/execution_device_id_gpu.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ #pragma once -#include +#include "execution_device_id_base.hpp" #include #include #include @@ -22,9 +22,9 @@ namespace raft { namespace detail { template <> -class device_id { +struct execution_device_id { using value_type = typename rmm::cuda_device_id::value_type; - device_id() noexcept(false) + execution_device_id() noexcept(false) : id_{[]() { auto raw_id = value_type{}; RAFT_CUDA_TRY(cudaGetDevice(&raw_id)); @@ -38,7 +38,7 @@ class device_id { * attached to this value and we can easily convert to the strongly-typed * rmm::cuda_device_id if desired. 
*/ - device_id(value_type dev_id) noexcept : id_{dev_id} {}; + execution_device_id(value_type dev_id) noexcept : id_{dev_id} {}; auto value() const noexcept { return id_.value(); } auto rmm_id() const noexcept { return id_; } diff --git a/cpp/include/raft/core/device_setter.hpp b/cpp/include/raft/core/device_setter.hpp index 4f915ae59c..badf7ae7fc 100644 --- a/cpp/include/raft/core/device_setter.hpp +++ b/cpp/include/raft/core/device_setter.hpp @@ -15,7 +15,7 @@ */ #pragma once #include -#ifdef CUML_ENABLE_GPU +#ifndef RAFT_DISABLE_CUDA #include #endif #include diff --git a/cpp/include/raft/core/device_support.hpp b/cpp/include/raft/core/device_support.hpp index f7ab1d7e5a..1bb58195d7 100644 --- a/cpp/include/raft/core/device_support.hpp +++ b/cpp/include/raft/core/device_support.hpp @@ -24,6 +24,24 @@ auto constexpr static const CUDA_ENABLED = false; auto constexpr static const CUDA_ENABLED = true; #endif +#ifdef __CUDACC__ +#define HOST __host__ +#define DEVICE __device__ +auto constexpr static const GPU_COMPILATION = true; +#else +#define HOST +#define DEVICE +auto constexpr static const GPU_COMPILATION = false; +#endif + +#ifndef DEBUG +auto constexpr static const DEBUG_ENABLED = false; +#elif DEBUG == 0 +auto constexpr static const DEBUG_ENABLED = false; +#else +auto constexpr static const DEBUG_ENABLED = true; +#endif + struct cuda_unsupported : raft::exception { explicit cuda_unsupported(std::string const& msg) : raft::exception{msg} {} cuda_unsupported() : cuda_unsupported{"CUDA functionality invoked in non-CUDA build"} {} diff --git a/cpp/include/raft/core/device_type.hpp b/cpp/include/raft/core/device_type.hpp index 94a8f88dc1..11938e8032 100644 --- a/cpp/include/raft/core/device_type.hpp +++ b/cpp/include/raft/core/device_type.hpp @@ -14,9 +14,16 @@ * limitations under the License. 
*/ #pragma once +#include namespace raft { enum class device_type { cpu, gpu }; + +auto constexpr is_compatible(device_type dev_type, memory_type mem_type) +{ + return (dev_type == device_type::gpu && is_device_accessible(mem_type)) || + (dev_type == device_type::cpu && is_host_accessible(mem_type)); +} } \ No newline at end of file diff --git a/cpp/include/raft/core/exceptions.hpp b/cpp/include/raft/core/exceptions.hpp index 3fe18a2d73..39afdce567 100644 --- a/cpp/include/raft/core/exceptions.hpp +++ b/cpp/include/raft/core/exceptions.hpp @@ -14,10 +14,10 @@ * limitations under the License. */ #pragma once -#include +#include namespace raft { -struct bad_cuda_call : std::exception { +struct bad_cuda_call : raft::exception { bad_cuda_call() : bad_cuda_call("CUDA API call failed") {} bad_cuda_call(char const* msg) : msg_{msg} {} virtual char const* what() const noexcept { return msg_; } @@ -26,7 +26,7 @@ struct bad_cuda_call : std::exception { char const* msg_; }; -struct out_of_bounds : std::exception { +struct out_of_bounds : raft::exception { out_of_bounds() : out_of_bounds("Attempted out-of-bounds memory access") {} out_of_bounds(char const* msg) : msg_{msg} {} virtual char const* what() const noexcept { return msg_; } @@ -35,7 +35,7 @@ struct out_of_bounds : std::exception { char const* msg_; }; -struct wrong_device_type : std::exception { +struct wrong_device_type : raft::exception { wrong_device_type() : wrong_device_type( "Attempted to use host data on GPU or device data on CPU" ) {} @@ -46,7 +46,7 @@ struct wrong_device_type : std::exception { char const* msg_; }; -struct mem_type_mismatch : std::exception { +struct mem_type_mismatch : raft::exception { mem_type_mismatch() : mem_type_mismatch( "Memory type does not match expected type" ) {} @@ -57,7 +57,7 @@ struct mem_type_mismatch : std::exception { char const* msg_; }; -struct wrong_device : std::exception { +struct wrong_device : raft::exception { wrong_device() : wrong_device( "Attempted to use 
incorrect device" ) {} diff --git a/cpp/include/raft/core/execution_device_id.hpp b/cpp/include/raft/core/execution_device_id.hpp index 36b63f17db..dedc4b5518 100644 --- a/cpp/include/raft/core/execution_device_id.hpp +++ b/cpp/include/raft/core/execution_device_id.hpp @@ -17,7 +17,7 @@ #include #include -#ifdef CUML_ENABLE_GPU +#ifndef RAFT_DISABLE_CUDA #include #endif #include @@ -25,7 +25,7 @@ namespace raft { template -using execution_device_id = detail::device_id; +using execution_device_id = detail::execution_device_id; using execution_device_id_variant = std::variant, execution_device_id>; } diff --git a/cpp/include/raft/core/execution_stream.hpp b/cpp/include/raft/core/execution_stream.hpp index e2ce14fbb2..e319dc866f 100644 --- a/cpp/include/raft/core/execution_stream.hpp +++ b/cpp/include/raft/core/execution_stream.hpp @@ -14,18 +14,18 @@ * limitations under the License. */ #pragma once -#ifdef CUML_ENABLE_GPU +#ifndef RAFT_DISABLE_CUDA #include #endif namespace raft { -#ifdef CUML_ENABLE_GPU +#ifndef RAFT_DISABLE_CUDA using execution_stream = cudaStream_t; #else using execution_stream = int; #endif inline void synchronize(execution_stream stream) { -#ifdef CUML_ENABLE_GPU +#ifndef RAFT_DISABLE_CUDA cudaStreamSynchronize(stream); #endif } diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp index 362cbc7f79..5fd7c3a1da 100644 --- a/cpp/include/raft/core/mdbuffer.hpp +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ #pragma once -#include "raft/core/memory_type.hpp" +#include #include #include #include @@ -28,8 +28,8 @@ #include #include #include -#include -#include +#include +#include namespace raft { /** @@ -37,46 +37,37 @@ namespace raft { * */ using index_type = std::size_t; -template -class buffer { +template +struct buffer { + using index_type = std::size_t; using value_type = T; - // using data_store = std::variant< - // non_owning_buffer, non_owning_buffer, owning_buffer, owning_buffer - // >; - - buffer() : buffer_type{}, size_{} {} + using data_store = std::variant< + detail::non_owning_buffer, detail::non_owning_buffer, detail::owning_buffer, detail::owning_buffer + >; - private: - execution_device_id_variant buffer_type; - index_type size_; - T* cached_ptr; -}; + buffer() : device_{}, data_{}, size_{} {} /** Construct non-initialized owning buffer */ -template -class buffer{ buffer(index_type size, - device_type mem_type = device_type::cpu, + memory_type mem_type = memory_type::host, int device = 0, - execution_stream stream = 0) + execution_stream stream = 0) : device_{[mem_type, &device]() { - auto result = {}; - switch (mem_type) { - case device_type::cpu: result = execution_device_id{device}; break; - case device_type::gpu: result = execution_device_id{device}; break; + auto result = execution_device_id_variant{}; + if (is_device_accessible(mem_type)) { + result = execution_device_id{device}; + } else { + result = execution_device_id{device}; } return result; }()}, data_{[this, mem_type, size, stream]() { auto result = data_store{}; - switch (mem_type) { - case device_type::cpu: - result = owning_buffer{size}; - break; - case device_type::gpu: - result = owning_buffer{std::get<1>(device_), size, stream}; - break; + if (is_device_accessible(mem_type)) { + result = detail::owning_buffer{std::get<1>(device_), size, stream}; + } else { + result = detail::owning_buffer{size}; } return result; }()}, @@ -93,34 +84,27 @@ class buffer{ }()} { } -} /** Construct 
non-owning buffer */ buffer(T* input_data, index_type size, - device_type mem_type = device_type::cpu, + memory_type mem_type = memory_type::host, int device = 0) : device_{[mem_type, &device]() { - auto result = device_id_variant{}; - switch (mem_type) { - case device_type::cpu: - result = device_id{device}; - break; - case device_type::gpu: - result = device_id{device}; - break; + auto result = execution_device_id_variant{}; + if (is_device_accessible(mem_type)) { + result = execution_device_id{device}; + } else { + result = execution_device_id{device}; } return result; }()}, data_{[this, input_data, mem_type]() { auto result = data_store{}; - switch (mem_type) { - case device_type::cpu: - result = non_owning_buffer{input_data}; - break; - case device_type::gpu: - result = non_owning_buffer{input_data}; - break; + if (is_device_accessible(mem_type)) { + result = detail::non_owning_buffer{input_data}; + } else { + result = detail::non_owning_buffer{input_data}; } return result; }()}, @@ -144,28 +128,25 @@ class buffer{ * A buffer constructed in this way is owning and will copy the data from * the original location */ - buffer(buffer const& other, device_type mem_type, int device = 0, cuda_stream stream=cuda_stream{}) + buffer(buffer const& other, memory_type mem_type, int device = 0, execution_stream stream=execution_stream{}) : device_{[mem_type, &device]() { - auto result = device_id_variant{}; - switch (mem_type) { - case device_type::cpu: - result = device_id{device}; - break; - case device_type::gpu: - result = device_id{device}; - break; + auto result = execution_device_id_variant{}; + if (is_device_accessible(mem_type)) { + result = execution_device_id{device}; + } else { + result = execution_device_id{device}; } return result; }()}, data_{[this, &other, mem_type, device, stream]() { auto result = data_store{}; auto result_data = static_cast(nullptr); - if (mem_type == device_type::cpu) { - auto buf = owning_buffer(other.size()); + if 
(is_device_accessible(mem_type)) { + auto buf = detail::owning_buffer(other.size()); result_data = buf.get(); result = std::move(buf); - } else if (mem_type==device_type::gpu) { - auto buf = owning_buffer(std::get<1>(device_), other.size(), stream); + } else { + auto buf = detail::owning_buffer(std::get<1>(device_), other.size(), stream); result_data = buf.get(); result = std::move(buf); } @@ -207,22 +188,19 @@ class buffer{ * @brief Create owning copy of existing buffer with given stream * The memory type of this new buffer will be the same as the original */ - buffer(buffer const& other, cuda_stream stream) : buffer(other, other.memory_type(), other.device_index(), stream) {} + buffer(buffer const& other, execution_stream stream) : buffer(other, other.memory_type(), other.device_index(), stream) {} /** * @brief Move from existing buffer unless a copy is necessary based on * memory location */ - buffer(buffer&& other, device_type mem_type, int device, cuda_stream stream) + buffer(buffer&& other, memory_type mem_type, int device, execution_stream stream) : device_{[mem_type, &device]() { - auto result = device_id_variant{}; - switch (mem_type) { - case device_type::cpu: - result = device_id{device}; - break; - case device_type::gpu: - result = device_id{device}; - break; + auto result = execution_device_id_variant{}; + if (is_device_accessible(mem_type)) { + result = execution_device_id{device}; + } else { + result = execution_device_id{device}; } return result; }()}, @@ -232,12 +210,13 @@ class buffer{ result = std::move(other.data_); } else { auto* result_data = static_cast(nullptr); - if (mem_type == device_type::cpu) { - auto buf = owning_buffer{other.size()}; + if (is_device_accessible(mem_type)) { + auto buf = detail::owning_buffer{device, other.size(), stream}; result_data = buf.get(); result = std::move(buf); - } else if (mem_type == device_type::gpu) { - auto buf = owning_buffer{device, other.size(), stream}; + } + else { + auto buf = 
detail::owning_buffer{other.size()}; result_data = buf.get(); result = std::move(buf); } @@ -259,11 +238,11 @@ class buffer{ { } buffer(buffer&& other, device_type mem_type, int device) - : buffer{std::move(other), mem_type, device, cuda_stream{}} + : buffer{std::move(other), mem_type, device, execution_stream{}} { } buffer(buffer&& other, device_type mem_type) - : buffer{std::move(other), mem_type, 0, cuda_stream{}} + : buffer{std::move(other), mem_type, 0, execution_stream{}} { } @@ -294,14 +273,14 @@ class buffer{ typename iter_t, typename = decltype(*std::declval(), void(), ++std::declval(), void()) > - buffer(iter_t const& begin, iter_t const& end, device_type mem_type, int device, cuda_stream stream=cuda_stream{}) : buffer{buffer{begin, end}, mem_type, device, stream} { } + buffer(iter_t const& begin, iter_t const& end, device_type mem_type, int device, execution_stream stream=execution_stream{}) : buffer{buffer{begin, end}, mem_type, device, stream} { } auto size() const noexcept { return size_; } HOST DEVICE auto* data() const noexcept { return cached_ptr; } - auto memory_type() const noexcept { - auto result = device_type{}; + auto device_type() const noexcept { + enum device_type result; if (device_.index() == 0) { result = device_type::cpu; } else { @@ -323,10 +302,16 @@ class buffer{ return result; } ~buffer() = default; + + private: + execution_device_id_variant device_; + data_store data_; + index_type size_; + T* cached_ptr; }; template -const_agnostic_same_t copy(buffer& dst, buffer const& src, typename buffer::index_type dst_offset, typename buffer::index_type src_offset, typename buffer::index_type size, cuda_stream stream) { +detail::const_agnostic_same_t copy(buffer& dst, buffer const& src, typename buffer::index_type dst_offset, typename buffer::index_type src_offset, typename buffer::index_type size, execution_stream stream) { if constexpr (bounds_check) { if (src.size() - src_offset < size || dst.size() - dst_offset < size) { throw 
out_of_bounds("Attempted copy to or from buffer of inadequate size"); @@ -336,16 +321,16 @@ const_agnostic_same_t copy(buffer& dst, buffer const& src, typename } template -const_agnostic_same_t copy(buffer& dst, buffer const& src, cuda_stream stream) { +detail::const_agnostic_same_t copy(buffer& dst, buffer const& src, execution_stream stream) { copy(dst, src, 0, 0, src.size(), stream); } template -const_agnostic_same_t copy(buffer& dst, buffer const& src) { - copy(dst, src, 0, 0, src.size(), cuda_stream{}); +detail::const_agnostic_same_t copy(buffer& dst, buffer const& src) { + copy(dst, src, 0, 0, src.size(), execution_stream{}); } template -const_agnostic_same_t copy(buffer&& dst, buffer&& src, typename buffer::index_type dst_offset, typename buffer::index_type src_offset, typename buffer::index_type size, cuda_stream stream) { +detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src, typename buffer::index_type dst_offset, typename buffer::index_type src_offset, typename buffer::index_type size, execution_stream stream) { if constexpr (bounds_check) { if (src.size() - src_offset < size || dst.size() - dst_offset < size) { throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); @@ -355,17 +340,17 @@ const_agnostic_same_t copy(buffer&& dst, buffer&& src, typename buff } template -const_agnostic_same_t copy(buffer&& dst, buffer&& src, typename buffer::index_type dst_offset, cuda_stream stream) { +detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src, typename buffer::index_type dst_offset, execution_stream stream) { copy(dst, src, dst_offset, 0, src.size(), stream); } template -const_agnostic_same_t copy(buffer&& dst, buffer&& src, cuda_stream stream) { +detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src, execution_stream stream) { copy(dst, src, 0, 0, src.size(), stream); } template -const_agnostic_same_t copy(buffer&& dst, buffer&& src) { - copy(dst, src, 0, 0, src.size(), cuda_stream{}); 
+detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src) { + copy(dst, src, 0, 0, src.size(), execution_stream{}); } } // namespace raft_proto \ No newline at end of file From ea11b070d9addc737fa3ead2d20c62a94196b447 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 6 Apr 2023 14:58:37 -0700 Subject: [PATCH 04/75] Merge --- .github/workflows/test.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 11ff3333d1..a18bb387f6 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -51,6 +51,6 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} package-name: raft_dask - test-before-amd64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04" - test-before-arm64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04" - test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test" + test-before-amd64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" + test-before-arm64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" + test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test" \ No newline at end of file From ab19410367429eecc3c76f356f24054061a97fca Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 6 Apr 2023 18:02:59 -0700 Subject: [PATCH 05/75] build --- .../raft/core/{mdbuffer.hpp => buffer.hpp} | 18 +++++++------- .../{ => detail/buffer_utils}/buffer_copy.hpp | 24 +++++++++---------- 2 files changed, 21 insertions(+), 21 deletions(-) rename 
cpp/include/raft/core/{mdbuffer.hpp => buffer.hpp} (94%) rename cpp/include/raft/core/{ => detail/buffer_utils}/buffer_copy.hpp (70%) diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/buffer.hpp similarity index 94% rename from cpp/include/raft/core/mdbuffer.hpp rename to cpp/include/raft/core/buffer.hpp index 5fd7c3a1da..8e50bb0e31 100644 --- a/cpp/include/raft/core/mdbuffer.hpp +++ b/cpp/include/raft/core/buffer.hpp @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include @@ -317,16 +317,16 @@ detail::const_agnostic_same_t copy(buffer& dst, buffer const& src, t throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); } } - copy(dst.data() + dst_offset, src.data() + src_offset, size, dst.memory_type(), src.memory_type(), stream); + detail::buffer_copy(dst.data() + dst_offset, src.data() + src_offset, size, dst.memory_type(), src.memory_type(), stream); } template detail::const_agnostic_same_t copy(buffer& dst, buffer const& src, execution_stream stream) { - copy(dst, src, 0, 0, src.size(), stream); + detail::buffer_copy(dst, src, 0, 0, src.size(), stream); } template detail::const_agnostic_same_t copy(buffer& dst, buffer const& src) { - copy(dst, src, 0, 0, src.size(), execution_stream{}); + detail::buffer_copy(dst, src, 0, 0, src.size(), execution_stream{}); } template @@ -336,21 +336,21 @@ detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src, typen throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); } } - copy(dst.data() + dst_offset, src.data() + src_offset, size, dst.memory_type(), src.memory_type(), stream); + detail::buffer_copy(dst.data() + dst_offset, src.data() + src_offset, size, dst.memory_type(), src.memory_type(), stream); } template detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src, typename buffer::index_type dst_offset, execution_stream stream) { - copy(dst, src, dst_offset, 0, src.size(), stream); + detail::buffer_copy(dst, src, 
dst_offset, 0, src.size(), stream); } template detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src, execution_stream stream) { - copy(dst, src, 0, 0, src.size(), stream); + detail::buffer_copy(dst, src, 0, 0, src.size(), stream); } template detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src) { - copy(dst, src, 0, 0, src.size(), execution_stream{}); + detail::buffer_copy(dst, src, 0, 0, src.size(), execution_stream{}); } -} // namespace raft_proto \ No newline at end of file +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/buffer_copy.hpp b/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp similarity index 70% rename from cpp/include/raft/core/buffer_copy.hpp rename to cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp index 741015139f..92bb674f3a 100644 --- a/cpp/include/raft/core/buffer_copy.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp @@ -23,48 +23,48 @@ #include namespace raft { - +namespace detail { template void buffer_copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset) { - detail::buffer_copy(dst + dst_offset, src + src_offset, size, execution_stream{}); + buffer_copy(dst + dst_offset, src + src_offset, size, execution_stream{}); } template void buffer_copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset, execution_stream stream) { - detail::buffer_copy(dst + dst_offset, src + src_offset, size, stream); + buffer_copy(dst + dst_offset, src + src_offset, size, stream); } template void buffer_copy(T* dst, T const* src, uint32_t size) { - detail::buffer_copy(dst, src, size, execution_stream{}); + buffer_copy(dst, src, size, execution_stream{}); } template void buffer_copy(T* dst, T const* src, uint32_t size, execution_stream stream) { - detail::buffer_copy(dst, src, size, stream); + buffer_copy(dst, src, size, stream); } template void buffer_copy(T* dst, T const* src, uint32_t size, device_type dst_type, 
device_type src_type, uint32_t dst_offset, uint32_t src_offset, execution_stream stream) { if (dst_type == device_type::gpu && src_type == device_type::gpu) { - detail::buffer_copy(dst + dst_offset, src + src_offset, size, stream); + buffer_copy(dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::cpu && src_type == device_type::cpu) { - detail::buffer_copy(dst + dst_offset, src + src_offset, size, stream); + buffer_copy(dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::gpu && src_type == device_type::cpu) { - detail::buffer_copy(dst + dst_offset, src + src_offset, size, stream); + buffer_copy(dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::cpu && src_type == device_type::gpu) { - detail::buffer_copy(dst + dst_offset, src + src_offset, size, stream); + buffer_copy(dst + dst_offset, src + src_offset, size, stream); } } template void buffer_copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type) { - detail::buffer_copy(dst, src, size, dst_type, src_type, 0, 0, execution_stream{}); + buffer_copy(dst, src, size, dst_type, src_type, 0, 0, execution_stream{}); } template void buffer_copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type, execution_stream stream) { - detail::buffer_copy(dst, src, size, dst_type, src_type, 0, 0, stream); + buffer_copy(dst, src, size, dst_type, src_type, 0, 0, stream); } - +} // namespace detail } // namespace raft \ No newline at end of file From 9870e9dc43052070db90cab981d59a1e47731352 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Fri, 7 Apr 2023 12:03:56 -0700 Subject: [PATCH 06/75] Test start --- cpp/test/core/buffer.cu | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 cpp/test/core/buffer.cu diff --git a/cpp/test/core/buffer.cu b/cpp/test/core/buffer.cu new file mode 100644 index 0000000000..e69de29bb2 From 
51a25818ae22fb72a0ea684268140991abd4de04 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Fri, 7 Apr 2023 12:04:21 -0700 Subject: [PATCH 07/75] Test start --- cpp/test/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 9109d84fe4..4ce4b96c41 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -93,6 +93,7 @@ if(BUILD_TESTS) NAME CORE_TEST PATH + test/core/buffer.cu test/core/logger.cpp test/core/math_device.cu test/core/math_host.cpp From d0e7b2cfb47ffe1604c40c045750fc9711b52d52 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Fri, 7 Apr 2023 12:28:26 -0700 Subject: [PATCH 08/75] style changes --- .../all_cuda-118_arch-x86_64.yaml | 2 +- cpp/CMakeLists.txt | 16 +- cpp/include/raft/core/buffer.hpp | 415 ++++++++++-------- .../core/detail/buffer_utils/buffer_copy.hpp | 70 ++- .../core/detail/buffer_utils/copy_cpu.hpp | 24 +- .../core/detail/buffer_utils/copy_gpu.hpp | 20 +- .../detail/buffer_utils/non_owning_buffer.hpp | 12 +- .../detail/buffer_utils/owning_buffer.hpp | 8 +- .../buffer_utils/owning_buffer_base.hpp | 10 +- .../detail/buffer_utils/owning_buffer_cpu.hpp | 20 +- .../detail/buffer_utils/owning_buffer_gpu.hpp | 22 +- .../raft/core/detail/const_agnostic.hpp | 2 +- .../raft/core/detail/device_setter_base.hpp | 4 +- .../raft/core/detail/device_setter_gpu.hpp | 23 +- .../core/detail/execution_device_id_base.hpp | 6 +- .../core/detail/execution_device_id_cpu.hpp | 5 +- .../core/detail/execution_device_id_gpu.hpp | 2 +- cpp/include/raft/core/device_support.hpp | 8 +- cpp/include/raft/core/device_type.hpp | 7 +- cpp/include/raft/core/exceptions.hpp | 16 +- cpp/include/raft/core/execution_device_id.hpp | 5 +- cpp/include/raft/core/execution_stream.hpp | 5 +- python/raft-dask/pyproject.toml | 1 - 23 files changed, 390 insertions(+), 313 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 
4ab9d95675..0e06076f1a 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -54,4 +54,4 @@ dependencies: - ucx-proc=*=gpu - ucx-py==0.32.* - ucx>=1.13.0 -name: all_cuda-118_arch-x86_64 \ No newline at end of file +name: all_cuda-118_arch-x86_64 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 144f58c4d6..1355b77875 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -70,13 +70,11 @@ option(RAFT_COMPILE_LIBRARY "Enable building raft shared library instantiations" ${RAFT_COMPILE_LIBRARY_DEFAULT} ) - -# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs -# to have different values for the `Threads::Threads` target. Setting this flag ensures +# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to +# have different values for the `Threads::Threads` target. Setting this flag ensures # `Threads::Threads` is the same value across all builds so that cache hits occur set(THREADS_PREFER_PTHREAD_FLAG ON) - include(CMakeDependentOption) # cmake_dependent_option( RAFT_USE_FAISS_STATIC "Build and statically link the FAISS library for # nearest neighbors search on GPU" ON RAFT_COMPILE_LIBRARY OFF ) @@ -612,7 +610,9 @@ rapids_export( COMPONENTS ${raft_components} COMPONENTS_EXPORT_SET ${raft_export_sets} GLOBAL_TARGETS raft compiled distributed - NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string + NAMESPACE raft:: + DOCUMENTATION doc_string + FINAL_CODE_BLOCK code_string ) # ################################################################################################## @@ -622,8 +622,10 @@ rapids_export( EXPORT_SET raft-exports COMPONENTS ${raft_components} COMPONENTS_EXPORT_SET ${raft_export_sets} - GLOBAL_TARGETS raft - compiled distributed DOCUMENTATION doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string + GLOBAL_TARGETS raft compiled distributed + DOCUMENTATION doc_string + 
NAMESPACE raft:: + FINAL_CODE_BLOCK code_string ) # ################################################################################################## diff --git a/cpp/include/raft/core/buffer.hpp b/cpp/include/raft/core/buffer.hpp index 8e50bb0e31..34f13bf3c7 100644 --- a/cpp/include/raft/core/buffer.hpp +++ b/cpp/include/raft/core/buffer.hpp @@ -14,22 +14,22 @@ * limitations under the License. */ #pragma once -#include #include -#include #include #include -#include -#include -#include #include -#include #include #include -#include +#include +#include #include #include -#include +#include +#include +#include +#include +#include +#include namespace raft { /** @@ -37,88 +37,86 @@ namespace raft { * */ using index_type = std::size_t; -template +template struct buffer { using index_type = std::size_t; using value_type = T; - using data_store = std::variant< - detail::non_owning_buffer, detail::non_owning_buffer, detail::owning_buffer, detail::owning_buffer - >; + using data_store = std::variant, + detail::non_owning_buffer, + detail::owning_buffer, + detail::owning_buffer>; buffer() : device_{}, data_{}, size_{} {} /** Construct non-initialized owning buffer */ buffer(index_type size, - memory_type mem_type = memory_type::host, - int device = 0, - execution_stream stream = 0) + memory_type mem_type = memory_type::host, + int device = 0, + execution_stream stream = 0) : device_{[mem_type, &device]() { - auto result = execution_device_id_variant{}; - if (is_device_accessible(mem_type)) { - result = execution_device_id{device}; - } else { - result = execution_device_id{device}; - } - return result; - }()}, - data_{[this, mem_type, size, stream]() { - auto result = data_store{}; - if (is_device_accessible(mem_type)) { - result = detail::owning_buffer{std::get<1>(device_), size, stream}; - } else { - result = detail::owning_buffer{size}; - } - return result; - }()}, - size_{size}, - cached_ptr {[this](){ - auto result = static_cast(nullptr); - switch(data_.index()) { - 
case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; - case 2: result = std::get<2>(data_).get(); break; - case 3: result = std::get<3>(data_).get(); break; - } - return result; - }()} + auto result = execution_device_id_variant{}; + if (is_device_accessible(mem_type)) { + result = execution_device_id{device}; + } else { + result = execution_device_id{device}; + } + return result; + }()}, + data_{[this, mem_type, size, stream]() { + auto result = data_store{}; + if (is_device_accessible(mem_type)) { + result = detail::owning_buffer{std::get<1>(device_), size, stream}; + } else { + result = detail::owning_buffer{size}; + } + return result; + }()}, + size_{size}, + cached_ptr{[this]() { + auto result = static_cast(nullptr); + switch (data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; + } + return result; + }()} { } /** Construct non-owning buffer */ - buffer(T* input_data, - index_type size, - memory_type mem_type = memory_type::host, - int device = 0) + buffer(T* input_data, index_type size, memory_type mem_type = memory_type::host, int device = 0) : device_{[mem_type, &device]() { - auto result = execution_device_id_variant{}; - if (is_device_accessible(mem_type)) { - result = execution_device_id{device}; - } else { - result = execution_device_id{device}; - } - return result; - }()}, - data_{[this, input_data, mem_type]() { - auto result = data_store{}; - if (is_device_accessible(mem_type)) { - result = detail::non_owning_buffer{input_data}; - } else { - result = detail::non_owning_buffer{input_data}; - } - return result; - }()}, - size_{size}, - cached_ptr {[this](){ - auto result = static_cast(nullptr); - switch(data_.index()) { - case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; - case 2: 
result = std::get<2>(data_).get(); break; - case 3: result = std::get<3>(data_).get(); break; - } - return result; - }()} + auto result = execution_device_id_variant{}; + if (is_device_accessible(mem_type)) { + result = execution_device_id{device}; + } else { + result = execution_device_id{device}; + } + return result; + }()}, + data_{[this, input_data, mem_type]() { + auto result = data_store{}; + if (is_device_accessible(mem_type)) { + result = detail::non_owning_buffer{input_data}; + } else { + result = detail::non_owning_buffer{input_data}; + } + return result; + }()}, + size_{size}, + cached_ptr{[this]() { + auto result = static_cast(nullptr); + switch (data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; + } + return result; + }()} { } @@ -128,42 +126,46 @@ struct buffer { * A buffer constructed in this way is owning and will copy the data from * the original location */ - buffer(buffer const& other, memory_type mem_type, int device = 0, execution_stream stream=execution_stream{}) + buffer(buffer const& other, + memory_type mem_type, + int device = 0, + execution_stream stream = execution_stream{}) : device_{[mem_type, &device]() { - auto result = execution_device_id_variant{}; - if (is_device_accessible(mem_type)) { - result = execution_device_id{device}; - } else { - result = execution_device_id{device}; - } - return result; - }()}, - data_{[this, &other, mem_type, device, stream]() { - auto result = data_store{}; - auto result_data = static_cast(nullptr); - if (is_device_accessible(mem_type)) { - auto buf = detail::owning_buffer(other.size()); - result_data = buf.get(); - result = std::move(buf); - } else { - auto buf = detail::owning_buffer(std::get<1>(device_), other.size(), stream); - result_data = buf.get(); - result = std::move(buf); - } - copy(result_data, other.data(), 
other.size(), mem_type, other.memory_type(), stream); - return result; - }()}, - size_{other.size()}, - cached_ptr {[this](){ - auto result = static_cast(nullptr); - switch(data_.index()) { - case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; - case 2: result = std::get<2>(data_).get(); break; - case 3: result = std::get<3>(data_).get(); break; - } - return result; - }()} + auto result = execution_device_id_variant{}; + if (is_device_accessible(mem_type)) { + result = execution_device_id{device}; + } else { + result = execution_device_id{device}; + } + return result; + }()}, + data_{[this, &other, mem_type, device, stream]() { + auto result = data_store{}; + auto result_data = static_cast(nullptr); + if (is_device_accessible(mem_type)) { + auto buf = detail::owning_buffer(other.size()); + result_data = buf.get(); + result = std::move(buf); + } else { + auto buf = + detail::owning_buffer(std::get<1>(device_), other.size(), stream); + result_data = buf.get(); + result = std::move(buf); + } + copy(result_data, other.data(), other.size(), mem_type, other.memory_type(), stream); + return result; + }()}, + size_{other.size()}, + cached_ptr{[this]() { + auto result = static_cast(nullptr); + switch (data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; + } + return result; + }()} { } @@ -172,14 +174,16 @@ struct buffer { * The memory type of this new buffer will be the same as the original */ buffer(buffer const& other) : buffer(other, other.memory_type(), other.device_index()) {} - friend void swap(buffer& first, buffer& second) { + friend void swap(buffer& first, buffer& second) + { using std::swap; swap(first.device_, second.device_); swap(first.data_, second.data_); swap(first.size_, second.size_); swap(first.cached_ptr, second.cached_ptr); } - 
buffer& operator=(buffer other) { + buffer& operator=(buffer other) + { swap(*this, other); return *this; } @@ -188,7 +192,10 @@ struct buffer { * @brief Create owning copy of existing buffer with given stream * The memory type of this new buffer will be the same as the original */ - buffer(buffer const& other, execution_stream stream) : buffer(other, other.memory_type(), other.device_index(), stream) {} + buffer(buffer const& other, execution_stream stream) + : buffer(other, other.memory_type(), other.device_index(), stream) + { + } /** * @brief Move from existing buffer unless a copy is necessary based on @@ -196,45 +203,44 @@ struct buffer { */ buffer(buffer&& other, memory_type mem_type, int device, execution_stream stream) : device_{[mem_type, &device]() { - auto result = execution_device_id_variant{}; - if (is_device_accessible(mem_type)) { - result = execution_device_id{device}; - } else { - result = execution_device_id{device}; - } - return result; - }()}, - data_{[&other, mem_type, device, stream]() { - auto result = data_store{}; - if (mem_type == other.memory_type() && device == other.device_index()) { - result = std::move(other.data_); - } else { - auto* result_data = static_cast(nullptr); + auto result = execution_device_id_variant{}; if (is_device_accessible(mem_type)) { - auto buf = detail::owning_buffer{device, other.size(), stream}; - result_data = buf.get(); - result = std::move(buf); + result = execution_device_id{device}; + } else { + result = execution_device_id{device}; } - else { - auto buf = detail::owning_buffer{other.size()}; - result_data = buf.get(); - result = std::move(buf); + return result; + }()}, + data_{[&other, mem_type, device, stream]() { + auto result = data_store{}; + if (mem_type == other.memory_type() && device == other.device_index()) { + result = std::move(other.data_); + } else { + auto* result_data = static_cast(nullptr); + if (is_device_accessible(mem_type)) { + auto buf = detail::owning_buffer{device, other.size(), 
stream}; + result_data = buf.get(); + result = std::move(buf); + } else { + auto buf = detail::owning_buffer{other.size()}; + result_data = buf.get(); + result = std::move(buf); + } + copy(result_data, other.data(), other.size(), mem_type, other.memory_type(), stream); } - copy(result_data, other.data(), other.size(), mem_type, other.memory_type(), stream); - } - return result; - }()}, - size_{other.size()}, - cached_ptr {[this](){ - auto result = static_cast(nullptr); - switch(data_.index()) { - case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; - case 2: result = std::get<2>(data_).get(); break; - case 3: result = std::get<3>(data_).get(); break; - } - return result; - }()} + return result; + }()}, + size_{other.size()}, + cached_ptr{[this]() { + auto result = static_cast(nullptr); + switch (data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; + } + return result; + }()} { } buffer(buffer&& other, device_type mem_type, int device) @@ -246,40 +252,42 @@ struct buffer { { } - buffer(buffer&& other) : buffer{} { - swap(*this, other); - } + buffer(buffer&& other) : buffer{} { swap(*this, other); } template < typename iter_t, - typename = decltype(*std::declval(), void(), ++std::declval(), void()) - > + typename = decltype(*std::declval(), void(), ++std::declval(), void())> buffer(iter_t const& begin, iter_t const& end) : buffer{static_cast(std::distance(begin, end))} { auto index = std::size_t{}; - std::for_each(begin, end, [&index, this](auto&& val) { - data()[index++] = val; - }); + std::for_each(begin, end, [&index, this](auto&& val) { data()[index++] = val; }); } template < typename iter_t, - typename = decltype(*std::declval(), void(), ++std::declval(), void()) - > - buffer(iter_t const& begin, iter_t const& end, device_type mem_type) : 
buffer{buffer{begin, end}, mem_type} { } + typename = decltype(*std::declval(), void(), ++std::declval(), void())> + buffer(iter_t const& begin, iter_t const& end, device_type mem_type) + : buffer{buffer{begin, end}, mem_type} + { + } template < typename iter_t, - typename = decltype(*std::declval(), void(), ++std::declval(), void()) - > - buffer(iter_t const& begin, iter_t const& end, device_type mem_type, int device, execution_stream stream=execution_stream{}) : buffer{buffer{begin, end}, mem_type, device, stream} { } + typename = decltype(*std::declval(), void(), ++std::declval(), void())> + buffer(iter_t const& begin, + iter_t const& end, + device_type mem_type, + int device, + execution_stream stream = execution_stream{}) + : buffer{buffer{begin, end}, mem_type, device, stream} + { + } auto size() const noexcept { return size_; } - HOST DEVICE auto* data() const noexcept { - return cached_ptr; - } - auto device_type() const noexcept { + HOST DEVICE auto* data() const noexcept { return cached_ptr; } + auto device_type() const noexcept + { enum device_type result; if (device_.index() == 0) { result = device_type::cpu; @@ -289,13 +297,12 @@ struct buffer { return result; } - auto device() const noexcept { - return device_; - } + auto device() const noexcept { return device_; } - auto device_index() const noexcept { + auto device_index() const noexcept + { auto result = int{}; - switch(device_.index()) { + switch (device_.index()) { case 0: result = std::get<0>(device_).value(); break; case 1: result = std::get<1>(device_).value(); break; } @@ -310,46 +317,78 @@ struct buffer { T* cached_ptr; }; -template -detail::const_agnostic_same_t copy(buffer& dst, buffer const& src, typename buffer::index_type dst_offset, typename buffer::index_type src_offset, typename buffer::index_type size, execution_stream stream) { +template +detail::const_agnostic_same_t copy(buffer& dst, + buffer const& src, + typename buffer::index_type dst_offset, + typename buffer::index_type 
src_offset, + typename buffer::index_type size, + execution_stream stream) +{ if constexpr (bounds_check) { if (src.size() - src_offset < size || dst.size() - dst_offset < size) { throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); } } - detail::buffer_copy(dst.data() + dst_offset, src.data() + src_offset, size, dst.memory_type(), src.memory_type(), stream); + detail::buffer_copy(dst.data() + dst_offset, + src.data() + src_offset, + size, + dst.memory_type(), + src.memory_type(), + stream); } -template -detail::const_agnostic_same_t copy(buffer& dst, buffer const& src, execution_stream stream) { +template +detail::const_agnostic_same_t copy(buffer& dst, + buffer const& src, + execution_stream stream) +{ detail::buffer_copy(dst, src, 0, 0, src.size(), stream); } -template -detail::const_agnostic_same_t copy(buffer& dst, buffer const& src) { +template +detail::const_agnostic_same_t copy(buffer& dst, buffer const& src) +{ detail::buffer_copy(dst, src, 0, 0, src.size(), execution_stream{}); } -template -detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src, typename buffer::index_type dst_offset, typename buffer::index_type src_offset, typename buffer::index_type size, execution_stream stream) { +template +detail::const_agnostic_same_t copy(buffer&& dst, + buffer&& src, + typename buffer::index_type dst_offset, + typename buffer::index_type src_offset, + typename buffer::index_type size, + execution_stream stream) +{ if constexpr (bounds_check) { if (src.size() - src_offset < size || dst.size() - dst_offset < size) { throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); } } - detail::buffer_copy(dst.data() + dst_offset, src.data() + src_offset, size, dst.memory_type(), src.memory_type(), stream); + detail::buffer_copy(dst.data() + dst_offset, + src.data() + src_offset, + size, + dst.memory_type(), + src.memory_type(), + stream); } -template -detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src, typename 
buffer::index_type dst_offset, execution_stream stream) { +template +detail::const_agnostic_same_t copy(buffer&& dst, + buffer&& src, + typename buffer::index_type dst_offset, + execution_stream stream) +{ detail::buffer_copy(dst, src, dst_offset, 0, src.size(), stream); } -template -detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src, execution_stream stream) { +template +detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src, execution_stream stream) +{ detail::buffer_copy(dst, src, 0, 0, src.size(), stream); } -template -detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src) { +template +detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src) +{ detail::buffer_copy(dst, src, 0, 0, src.size(), execution_stream{}); } diff --git a/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp b/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp index 92bb674f3a..ac70e77ab9 100644 --- a/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp @@ -14,9 +14,9 @@ * limitations under the License. 
*/ #pragma once +#include #include #include -#include #ifndef RAFT_DISABLE_CUDA #include #endif @@ -24,47 +24,75 @@ namespace raft { namespace detail { -template -void buffer_copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset) { +template +void buffer_copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset) +{ buffer_copy(dst + dst_offset, src + src_offset, size, execution_stream{}); } -template -void buffer_copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset, execution_stream stream) { +template +void buffer_copy(T* dst, + T const* src, + uint32_t size, + uint32_t dst_offset, + uint32_t src_offset, + execution_stream stream) +{ buffer_copy(dst + dst_offset, src + src_offset, size, stream); } -template -void buffer_copy(T* dst, T const* src, uint32_t size) { +template +void buffer_copy(T* dst, T const* src, uint32_t size) +{ buffer_copy(dst, src, size, execution_stream{}); } -template -void buffer_copy(T* dst, T const* src, uint32_t size, execution_stream stream) { +template +void buffer_copy(T* dst, T const* src, uint32_t size, execution_stream stream) +{ buffer_copy(dst, src, size, stream); } -template -void buffer_copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type, uint32_t dst_offset, uint32_t src_offset, execution_stream stream) { +template +void buffer_copy(T* dst, + T const* src, + uint32_t size, + device_type dst_type, + device_type src_type, + uint32_t dst_offset, + uint32_t src_offset, + execution_stream stream) +{ if (dst_type == device_type::gpu && src_type == device_type::gpu) { - buffer_copy(dst + dst_offset, src + src_offset, size, stream); + buffer_copy( + dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::cpu && src_type == device_type::cpu) { - buffer_copy(dst + dst_offset, src + src_offset, size, stream); + buffer_copy( + dst + dst_offset, src + src_offset, size, stream); } else if 
(dst_type == device_type::gpu && src_type == device_type::cpu) { - buffer_copy(dst + dst_offset, src + src_offset, size, stream); + buffer_copy( + dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::cpu && src_type == device_type::gpu) { - buffer_copy(dst + dst_offset, src + src_offset, size, stream); + buffer_copy( + dst + dst_offset, src + src_offset, size, stream); } } -template -void buffer_copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type) { +template +void buffer_copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type) +{ buffer_copy(dst, src, size, dst_type, src_type, 0, 0, execution_stream{}); } -template -void buffer_copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type, execution_stream stream) { +template +void buffer_copy(T* dst, + T const* src, + uint32_t size, + device_type dst_type, + device_type src_type, + execution_stream stream) +{ buffer_copy(dst, src, size, dst_type, src_type, 0, 0, stream); } -} // namespace detail -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp index 272c589b4f..5fc0064feb 100644 --- a/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp @@ -16,22 +16,32 @@ #pragma once #include #include +#include #include #include -#include namespace raft { namespace detail { -template -std::enable_if_t, std::bool_constant>, void> buffer_copy(T* dst, T const* src, uint32_t size, execution_stream stream) { +template +std::enable_if_t, + std::bool_constant>, + void> +buffer_copy(T* dst, T const* src, uint32_t size, execution_stream stream) +{ std::copy(src, src + size, dst); } -template -std::enable_if_t, std::bool_constant>, std::bool_constant>, void> 
buffer_copy(T* dst, T const* src, uint32_t size, execution_stream stream) { +template +std::enable_if_t< + std::conjunction_v, + std::bool_constant>, + std::bool_constant>, + void> +buffer_copy(T* dst, T const* src, uint32_t size, execution_stream stream) +{ throw raft::cuda_unsupported("Copying from or to device in non-GPU build"); } -} // namespace detail -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp index f12998d8c4..06e059ed1d 100644 --- a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp @@ -14,12 +14,12 @@ * limitations under the License. */ #pragma once -#include "raft/util/cuda_rt_essentials.hpp" -#include "raft/util/cudart_utils.hpp" #include +#include #include #include -#include +#include +#include #include #include @@ -27,10 +27,16 @@ namespace raft { namespace detail { -template -std::enable_if_t, std::bool_constant>, std::bool_constant>, void> buffer_copy(T* dst, T const* src, uint32_t size, raft::execution_stream stream) { +template +std::enable_if_t< + std::conjunction_v, + std::bool_constant>, + std::bool_constant>, + void> +buffer_copy(T* dst, T const* src, uint32_t size, raft::execution_stream stream) +{ RAFT_CUDA_TRY(thrust::copy(rmm::exec_policy(stream), src, src + size, dst)); } -} // namespace detail -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp index 7f2155e8a2..62a08b469f 100644 --- a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp @@ -14,17 +14,17 @@ * limitations under the 
License. */ #pragma once -#include #include +#include namespace raft { namespace detail { -template +template class non_owning_buffer { using value_type = std::remove_const_t; - non_owning_buffer() : data_{nullptr} { } + non_owning_buffer() : data_{nullptr} {} - non_owning_buffer(T* ptr) : data_{ptr} { } + non_owning_buffer(T* ptr) : data_{ptr} {} auto* get() const { return data_; } @@ -32,5 +32,5 @@ class non_owning_buffer { // TODO(wphicks): Back this with RMM-allocated host memory T* data_; }; -} // namespace detail -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp index f9531ab21f..b8bad96dd4 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp @@ -14,15 +14,15 @@ * limitations under the License. */ #pragma once -#include #include "owning_buffer_cpu.hpp" +#include #ifndef RAFT_DISABLE_CUDA #include "owning_buffer_gpu.hpp" #endif namespace raft { namespace detail { -template +template using owning_buffer = owning_buffer; -} // namespace detail -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp index 4c7531dd2d..c6f4b13856 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp @@ -14,20 +14,20 @@ * limitations under the License. 
*/ #pragma once -#include -#include #include +#include +#include #include namespace raft { namespace detail { -template +template class owning_buffer { owning_buffer() {} owning_buffer(execution_device_id device_id, std::size_t size, execution_stream stream) {} auto* get() const { return static_cast(nullptr); } }; -} // namespace detail -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp index a70ff60ce1..04a6a5033c 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp @@ -14,28 +14,22 @@ * limitations under the License. */ #pragma once -#include -#include #include "owning_buffer_base.hpp" #include +#include +#include #include namespace raft { namespace detail { -template +template class owning_buffer { // TODO(wphicks): Assess need for buffers of const T using value_type = std::remove_const_t; - owning_buffer() - : data_{std::unique_ptr{nullptr}} - { - } + owning_buffer() : data_{std::unique_ptr{nullptr}} {} - owning_buffer(std::size_t size) - : data_{std::make_unique(size)} - { - } + owning_buffer(std::size_t size) : data_{std::make_unique(size)} {} auto* get() const { return data_.get(); } @@ -43,5 +37,5 @@ class owning_buffer { // TODO(wphicks): Back this with RMM-allocated host memory std::unique_ptr data_; }; -} // namespace detail -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp index 1922022755..c152fcff77 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp +++ 
b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp @@ -14,25 +14,27 @@ * limitations under the License. */ #pragma once +#include "owning_buffer_base.hpp" #include -#include -#include #include -#include "owning_buffer_base.hpp" +#include +#include #include namespace raft { namespace detail { -template +template class owning_buffer { using value_type = std::remove_const_t; owning_buffer() : data_{} {} - owning_buffer(execution_device_id execution_device_id, std::size_t size, cudaStream_t stream) noexcept(false) + owning_buffer(execution_device_id execution_device_id, + std::size_t size, + cudaStream_t stream) noexcept(false) : data_{[&execution_device_id, &size, &stream]() { - auto device_context = device_setter{execution_device_id}; - return rmm::device_buffer{size * sizeof(value_type), rmm::cuda_stream_view{stream}}; - }()} + auto device_context = device_setter{execution_device_id}; + return rmm::device_buffer{size * sizeof(value_type), rmm::cuda_stream_view{stream}}; + }()} { } @@ -41,5 +43,5 @@ class owning_buffer { private: mutable rmm::device_buffer data_; }; -} // namespace detail -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/const_agnostic.hpp b/cpp/include/raft/core/detail/const_agnostic.hpp index e0e20db3dc..85e99806b6 100644 --- a/cpp/include/raft/core/detail/const_agnostic.hpp +++ b/cpp/include/raft/core/detail/const_agnostic.hpp @@ -24,4 +24,4 @@ using const_agnostic_same_t = template inline constexpr auto const_agnostic_same_v = std::is_same_v, std::remove_const_t>; -} +} // namespace raft::detail diff --git a/cpp/include/raft/core/detail/device_setter_base.hpp b/cpp/include/raft/core/detail/device_setter_base.hpp index cebc3a5b4d..e6cee3f5e4 100644 --- a/cpp/include/raft/core/detail/device_setter_base.hpp +++ b/cpp/include/raft/core/detail/device_setter_base.hpp @@ -26,5 +26,5 @@ class device_setter { 
device_setter(execution_device_id device) {} }; -} -} \ No newline at end of file +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/device_setter_gpu.hpp b/cpp/include/raft/core/detail/device_setter_gpu.hpp index 1468aaae6f..babb7c89b3 100644 --- a/cpp/include/raft/core/detail/device_setter_gpu.hpp +++ b/cpp/include/raft/core/detail/device_setter_gpu.hpp @@ -14,11 +14,11 @@ * limitations under the License. */ #pragma once -#include "raft/util/cuda_rt_essentials.hpp" #include #include #include #include +#include #include namespace raft { @@ -27,20 +27,21 @@ namespace detail { /** Class for setting current device within a code block */ template <> class device_setter { - device_setter(raft::execution_device_id device) noexcept(false) : prev_device_{[]() { - auto result = int{}; - RAFT_CUDA_TRY(cudaGetDevice(&result)); - return result; - }()} { + device_setter(raft::execution_device_id device) noexcept(false) + : prev_device_{[]() { + auto result = int{}; + RAFT_CUDA_TRY(cudaGetDevice(&result)); + return result; + }()} + { RAFT_CUDA_TRY(cudaSetDevice(device.value())); } - ~device_setter() { - RAFT_CUDA_TRY_NO_THROW(cudaSetDevice(prev_device_.value())); - } + ~device_setter() { RAFT_CUDA_TRY_NO_THROW(cudaSetDevice(prev_device_.value())); } + private: execution_device_id prev_device_; }; -} -} +} // namespace detail +} // namespace raft diff --git a/cpp/include/raft/core/detail/execution_device_id_base.hpp b/cpp/include/raft/core/detail/execution_device_id_base.hpp index 2e9d13a6e2..fd417d44f1 100644 --- a/cpp/include/raft/core/detail/execution_device_id_base.hpp +++ b/cpp/include/raft/core/detail/execution_device_id_base.hpp @@ -18,12 +18,12 @@ namespace raft { namespace detail { -template +template struct execution_device_id { using value_type = int; execution_device_id(value_type device_index) {} auto value() const { return value_type{}; } }; -} -} +} // namespace detail +} // namespace raft diff --git 
a/cpp/include/raft/core/detail/execution_device_id_cpu.hpp b/cpp/include/raft/core/detail/execution_device_id_cpu.hpp index d9317bc51f..56b52a6e4c 100644 --- a/cpp/include/raft/core/detail/execution_device_id_cpu.hpp +++ b/cpp/include/raft/core/detail/execution_device_id_cpu.hpp @@ -26,8 +26,9 @@ struct execution_device_id { execution_device_id(value_type dev_id) : id_{dev_id} {}; auto value() const noexcept { return id_; } + private: value_type id_; }; -} -} \ No newline at end of file +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/execution_device_id_gpu.hpp b/cpp/include/raft/core/detail/execution_device_id_gpu.hpp index 771c0b0b5c..a039c8ee02 100644 --- a/cpp/include/raft/core/detail/execution_device_id_gpu.hpp +++ b/cpp/include/raft/core/detail/execution_device_id_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/core/device_support.hpp b/cpp/include/raft/core/device_support.hpp index 1bb58195d7..ba39c1b29c 100644 --- a/cpp/include/raft/core/device_support.hpp +++ b/cpp/include/raft/core/device_support.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,11 +21,11 @@ namespace raft { #ifdef RAFT_DISABLE_CUDA auto constexpr static const CUDA_ENABLED = false; #else -auto constexpr static const CUDA_ENABLED = true; +auto constexpr static const CUDA_ENABLED = true; #endif #ifdef __CUDACC__ -#define HOST __host__ +#define HOST __host__ #define DEVICE __device__ auto constexpr static const GPU_COMPILATION = true; #else @@ -37,7 +37,7 @@ auto constexpr static const GPU_COMPILATION = false; #ifndef DEBUG auto constexpr static const DEBUG_ENABLED = false; #elif DEBUG == 0 -auto constexpr static const DEBUG_ENABLED = false; +auto constexpr static const DEBUG_ENABLED = false; #else auto constexpr static const DEBUG_ENABLED = true; #endif diff --git a/cpp/include/raft/core/device_type.hpp b/cpp/include/raft/core/device_type.hpp index 11938e8032..a411c8bef7 100644 --- a/cpp/include/raft/core/device_type.hpp +++ b/cpp/include/raft/core/device_type.hpp @@ -16,14 +16,11 @@ #pragma once #include namespace raft { -enum class device_type { - cpu, - gpu -}; +enum class device_type { cpu, gpu }; auto constexpr is_compatible(device_type dev_type, memory_type mem_type) { return (dev_type == device_type::gpu && is_device_accessible(mem_type)) || (dev_type == device_type::cpu && is_host_accessible(mem_type)); } -} \ No newline at end of file +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/exceptions.hpp b/cpp/include/raft/core/exceptions.hpp index 39afdce567..bdd5e03856 100644 --- a/cpp/include/raft/core/exceptions.hpp +++ b/cpp/include/raft/core/exceptions.hpp @@ -36,9 +36,9 @@ struct out_of_bounds : raft::exception { }; struct wrong_device_type : raft::exception { - wrong_device_type() : wrong_device_type( - "Attempted to use host data on GPU or device data on CPU" - ) {} + wrong_device_type() : wrong_device_type("Attempted to use host data on GPU or device data on CPU") + { + } wrong_device_type(char const* msg) : msg_{msg} {} virtual char const* what() const noexcept { return msg_; } @@ -47,9 
+47,7 @@ struct wrong_device_type : raft::exception { }; struct mem_type_mismatch : raft::exception { - mem_type_mismatch() : mem_type_mismatch( - "Memory type does not match expected type" - ) {} + mem_type_mismatch() : mem_type_mismatch("Memory type does not match expected type") {} mem_type_mismatch(char const* msg) : msg_{msg} {} virtual char const* what() const noexcept { return msg_; } @@ -58,9 +56,7 @@ struct mem_type_mismatch : raft::exception { }; struct wrong_device : raft::exception { - wrong_device() : wrong_device( - "Attempted to use incorrect device" - ) {} + wrong_device() : wrong_device("Attempted to use incorrect device") {} wrong_device(char const* msg) : msg_{msg} {} virtual char const* what() const noexcept { return msg_; } @@ -68,4 +64,4 @@ struct wrong_device : raft::exception { char const* msg_; }; -} \ No newline at end of file +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/execution_device_id.hpp b/cpp/include/raft/core/execution_device_id.hpp index dedc4b5518..5c7bae4575 100644 --- a/cpp/include/raft/core/execution_device_id.hpp +++ b/cpp/include/raft/core/execution_device_id.hpp @@ -27,5 +27,6 @@ namespace raft { template using execution_device_id = detail::execution_device_id; -using execution_device_id_variant = std::variant, execution_device_id>; -} +using execution_device_id_variant = + std::variant, execution_device_id>; +} // namespace raft diff --git a/cpp/include/raft/core/execution_stream.hpp b/cpp/include/raft/core/execution_stream.hpp index e319dc866f..945d6c55b7 100644 --- a/cpp/include/raft/core/execution_stream.hpp +++ b/cpp/include/raft/core/execution_stream.hpp @@ -24,9 +24,10 @@ using execution_stream = cudaStream_t; #else using execution_stream = int; #endif -inline void synchronize(execution_stream stream) { +inline void synchronize(execution_stream stream) +{ #ifndef RAFT_DISABLE_CUDA cudaStreamSynchronize(stream); #endif } -} \ No newline at end of file +} // namespace raft \ No 
newline at end of file diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index 8b7db3ada0..d7095aa00c 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -35,7 +35,6 @@ license = { text = "Apache 2.0" } requires-python = ">=3.8" dependencies = [ "dask-cuda==23.6.*", - "dask-cuda==23.4.*", "dask==2023.3.2", "distributed==2023.3.2.1", "joblib>=0.11", From f72f7f80f566048a47ea6adea576ec536326a60f Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Fri, 7 Apr 2023 12:30:56 -0700 Subject: [PATCH 09/75] merge --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a18bb387f6..dc8f7b6f2b 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -53,4 +53,4 @@ jobs: package-name: raft_dask test-before-amd64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" test-before-arm64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" - test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test" \ No newline at end of file + test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test" From 05f9daa4f10546b3d3b8485e8020558930b7e4f2 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Fri, 7 Apr 2023 12:32:29 -0700 Subject: [PATCH 10/75] merge dependencies.yaml --- dependencies.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies.yaml b/dependencies.yaml index 280e355d81..f3e0cd1167 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -292,4 +292,4 @@ dependencies: packages: - cupy - scikit-learn - - scipy \ No newline at end of file + - scipy From 02509310d8866f2ff16724045490229c7efc6ed2 Mon Sep 17 00:00:00 
2001 From: Tarang Jain Date: Mon, 10 Apr 2023 12:08:01 -0700 Subject: [PATCH 11/75] Updates --- cpp/CMakeLists.txt | 8 + cpp/include/raft/core/buffer.hpp | 112 +++---- .../core/detail/buffer_utils/buffer_copy.hpp | 16 +- .../core/detail/buffer_utils/copy_cpu.hpp | 4 +- .../core/detail/buffer_utils/copy_gpu.hpp | 5 +- .../detail/buffer_utils/non_owning_buffer.hpp | 2 +- .../detail/buffer_utils/owning_buffer.hpp | 9 +- .../buffer_utils/owning_buffer_base.hpp | 2 +- .../detail/buffer_utils/owning_buffer_cpu.hpp | 2 +- .../detail/buffer_utils/owning_buffer_gpu.hpp | 2 +- .../raft/core/detail/device_setter_base.hpp | 2 +- .../raft/core/detail/device_setter_gpu.hpp | 2 +- cpp/include/raft/core/device_mdbuffer.hpp | 316 ------------------ cpp/test/core/buffer.cu | 58 ++++ 14 files changed, 130 insertions(+), 410 deletions(-) delete mode 100644 cpp/include/raft/core/device_mdbuffer.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1355b77875..3d9d7c9419 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -56,6 +56,7 @@ option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and librari option(CUDA_LOG_COMPILE_TIME "Write a log of compilation times to nvcc_compile_log.csv" OFF) option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON) option(DISABLE_DEPRECATION_WARNINGS "Disable deprecaction warnings " ON) +option(DISABLE_CUDA "Disable CUDA in supported RAFT code" OFF) option(DISABLE_OPENMP "Disable OpenMP" OFF) option(RAFT_NVTX "Enable nvtx markers" OFF) @@ -247,6 +248,13 @@ target_compile_definitions(raft::raft INTERFACE $<$:NVTX_ENAB ) endif() +############################################################################## +# - CUDA-free build support -------------------------------------------------- + +if (DISABLE_CUDA) + target_compile_definitions(raft INTERFACE RAFT_DISABLE_CUDA) +endif() + # ################################################################################################## # * raft_compiled 
------------------------------------------------------------ TODO: Currently, this # package also contains the 'random' namespace (for rmat logic) We couldn't get this to work diff --git a/cpp/include/raft/core/buffer.hpp b/cpp/include/raft/core/buffer.hpp index 34f13bf3c7..0f73b6d41c 100644 --- a/cpp/include/raft/core/buffer.hpp +++ b/cpp/include/raft/core/buffer.hpp @@ -47,7 +47,7 @@ struct buffer { detail::owning_buffer, detail::owning_buffer>; - buffer() : device_{}, data_{}, size_{} {} + buffer() : device_{}, data_{}, size_{}, memory_type_{memory_type::host} {} /** Construct non-initialized owning buffer */ buffer(index_type size, @@ -73,6 +73,7 @@ struct buffer { return result; }()}, size_{size}, + memory_type_{mem_type}, cached_ptr{[this]() { auto result = static_cast(nullptr); switch (data_.index()) { @@ -107,6 +108,7 @@ struct buffer { return result; }()}, size_{size}, + memory_type_{mem_type}, cached_ptr{[this]() { auto result = static_cast(nullptr); switch (data_.index()) { @@ -121,8 +123,7 @@ struct buffer { } /** - * @brief Construct one buffer from another in the given memory location - * (either on host or on device) + * @brief Construct one buffer from another of the given memory type * A buffer constructed in this way is owning and will copy the data from * the original location */ @@ -143,19 +144,21 @@ struct buffer { auto result = data_store{}; auto result_data = static_cast(nullptr); if (is_device_accessible(mem_type)) { - auto buf = detail::owning_buffer(other.size()); + auto buf = + detail::owning_buffer(std::get<1>(device_), other.size(), stream); result_data = buf.get(); result = std::move(buf); + // detail::buffer_copy(result_data, other.data(), other.size(), device_type::gpu, other.device_type(), stream); } else { - auto buf = - detail::owning_buffer(std::get<1>(device_), other.size(), stream); + auto buf = detail::owning_buffer(other.size()); result_data = buf.get(); result = std::move(buf); + detail::buffer_copy(result_data, 
other.data(), other.size(), device_type::cpu, other.device_type(), stream); } - copy(result_data, other.data(), other.size(), mem_type, other.memory_type(), stream); return result; }()}, size_{other.size()}, + memory_type_{mem_type}, cached_ptr{[this]() { auto result = static_cast(nullptr); switch (data_.index()) { @@ -173,7 +176,11 @@ struct buffer { * @brief Create owning copy of existing buffer * The memory type of this new buffer will be the same as the original */ - buffer(buffer const& other) : buffer(other, other.memory_type(), other.device_index()) {} + buffer(buffer const& other) : buffer(other, + other.memory_type(), + other.device_index()) + { + } friend void swap(buffer& first, buffer& second) { using std::swap; @@ -190,10 +197,9 @@ struct buffer { /** * @brief Create owning copy of existing buffer with given stream - * The memory type of this new buffer will be the same as the original + * The device type of this new buffer will be the same as the original */ - buffer(buffer const& other, execution_stream stream) - : buffer(other, other.memory_type(), other.device_index(), stream) + buffer(buffer const& other, execution_stream stream) : buffer(other, other.memory_type(), other.device_index(), stream) { } @@ -221,16 +227,18 @@ struct buffer { auto buf = detail::owning_buffer{device, other.size(), stream}; result_data = buf.get(); result = std::move(buf); + detail::buffer_copy(result_data, other.data(), other.size(), device_type::gpu, other.device_type(), stream); } else { auto buf = detail::owning_buffer{other.size()}; result_data = buf.get(); result = std::move(buf); + detail::buffer_copy(result_data, other.data(), other.size(), device_type::cpu, other.device_type(), stream); } - copy(result_data, other.data(), other.size(), mem_type, other.memory_type(), stream); } return result; }()}, size_{other.size()}, + memory_type_{mem_type}, cached_ptr{[this]() { auto result = static_cast(nullptr); switch (data_.index()) { @@ -286,16 +294,6 @@ struct buffer { 
auto size() const noexcept { return size_; } HOST DEVICE auto* data() const noexcept { return cached_ptr; } - auto device_type() const noexcept - { - enum device_type result; - if (device_.index() == 0) { - result = device_type::cpu; - } else { - result = device_type::gpu; - } - return result; - } auto device() const noexcept { return device_; } @@ -308,12 +306,30 @@ struct buffer { } return result; } + + auto memory_type() const noexcept + { + return memory_type_; + } + ~buffer() = default; private: + auto device_type() const noexcept + { + enum device_type result; + if (device_.index() == 0) { + result = device_type::cpu; + } else { + result = device_type::gpu; + } + return result; + } + execution_device_id_variant device_; data_store data_; index_type size_; + enum memory_type memory_type_; T* cached_ptr; }; @@ -330,11 +346,13 @@ detail::const_agnostic_same_t copy(buffer& dst, throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); } } + auto src_device_type = is_device_accessible(src.memory_type()) ? device_type::gpu : device_type::cpu; + auto dst_device_type = is_device_accessible(dst.memory_type()) ? 
device_type::gpu : device_type::cpu; detail::buffer_copy(dst.data() + dst_offset, src.data() + src_offset, size, - dst.memory_type(), - src.memory_type(), + dst_device_type, + src_device_type, stream); } @@ -343,53 +361,11 @@ detail::const_agnostic_same_t copy(buffer& dst, buffer const& src, execution_stream stream) { - detail::buffer_copy(dst, src, 0, 0, src.size(), stream); + copy(dst, src, 0, 0, src.size(), stream); } template detail::const_agnostic_same_t copy(buffer& dst, buffer const& src) { - detail::buffer_copy(dst, src, 0, 0, src.size(), execution_stream{}); + copy(dst, src, 0, 0, src.size(), execution_stream{}); } - -template -detail::const_agnostic_same_t copy(buffer&& dst, - buffer&& src, - typename buffer::index_type dst_offset, - typename buffer::index_type src_offset, - typename buffer::index_type size, - execution_stream stream) -{ - if constexpr (bounds_check) { - if (src.size() - src_offset < size || dst.size() - dst_offset < size) { - throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); - } - } - detail::buffer_copy(dst.data() + dst_offset, - src.data() + src_offset, - size, - dst.memory_type(), - src.memory_type(), - stream); -} - -template -detail::const_agnostic_same_t copy(buffer&& dst, - buffer&& src, - typename buffer::index_type dst_offset, - execution_stream stream) -{ - detail::buffer_copy(dst, src, dst_offset, 0, src.size(), stream); -} - -template -detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src, execution_stream stream) -{ - detail::buffer_copy(dst, src, 0, 0, src.size(), stream); -} -template -detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src) -{ - detail::buffer_copy(dst, src, 0, 0, src.size(), execution_stream{}); -} - } // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp b/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp index ac70e77ab9..715d65d38d 100644 --- 
a/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp @@ -27,7 +27,7 @@ namespace detail { template void buffer_copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset) { - buffer_copy(dst + dst_offset, src + src_offset, size, execution_stream{}); + copy(dst + dst_offset, src + src_offset, size, execution_stream{}); } template @@ -38,19 +38,19 @@ void buffer_copy(T* dst, uint32_t src_offset, execution_stream stream) { - buffer_copy(dst + dst_offset, src + src_offset, size, stream); + copy(dst + dst_offset, src + src_offset, size, stream); } template void buffer_copy(T* dst, T const* src, uint32_t size) { - buffer_copy(dst, src, size, execution_stream{}); + copy(dst, src, size, execution_stream{}); } template void buffer_copy(T* dst, T const* src, uint32_t size, execution_stream stream) { - buffer_copy(dst, src, size, stream); + copy(dst, src, size, stream); } template @@ -64,16 +64,16 @@ void buffer_copy(T* dst, execution_stream stream) { if (dst_type == device_type::gpu && src_type == device_type::gpu) { - buffer_copy( + copy( dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::cpu && src_type == device_type::cpu) { - buffer_copy( + copy( dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::gpu && src_type == device_type::cpu) { - buffer_copy( + copy( dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::cpu && src_type == device_type::gpu) { - buffer_copy( + copy( dst + dst_offset, src + src_offset, size, stream); } } diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp index 5fc0064feb..2555c251b3 100644 --- a/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp @@ -27,7 +27,7 @@ template std::enable_if_t, std::bool_constant>, void> 
-buffer_copy(T* dst, T const* src, uint32_t size, execution_stream stream) +copy(T* dst, T const* src, uint32_t size, execution_stream stream) { std::copy(src, src + size, dst); } @@ -38,7 +38,7 @@ std::enable_if_t< std::bool_constant>, std::bool_constant>, void> -buffer_copy(T* dst, T const* src, uint32_t size, execution_stream stream) +copy(T* dst, T const* src, uint32_t size, execution_stream stream) { throw raft::cuda_unsupported("Copying from or to device in non-GPU build"); } diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp index 06e059ed1d..3b5afe7ce0 100644 --- a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -33,9 +34,9 @@ std::enable_if_t< std::bool_constant>, std::bool_constant>, void> -buffer_copy(T* dst, T const* src, uint32_t size, raft::execution_stream stream) +copy(T* dst, T const* src, uint32_t size, raft::execution_stream stream) { - RAFT_CUDA_TRY(thrust::copy(rmm::exec_policy(stream), src, src + size, dst)); + thrust::copy(rmm::exec_policy(stream), src, src + size, dst); } } // namespace detail diff --git a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp index 62a08b469f..f1e2361ee3 100644 --- a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp @@ -20,7 +20,7 @@ namespace raft { namespace detail { template -class non_owning_buffer { +struct non_owning_buffer { using value_type = std::remove_const_t; non_owning_buffer() : data_{nullptr} {} diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp index b8bad96dd4..c9f1aeca06 100644 --- 
a/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp @@ -18,11 +18,4 @@ #include #ifndef RAFT_DISABLE_CUDA #include "owning_buffer_gpu.hpp" -#endif -namespace raft { -namespace detail { -template -using owning_buffer = owning_buffer; - -} // namespace detail -} // namespace raft \ No newline at end of file +#endif \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp index c6f4b13856..c112844a3a 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp @@ -23,7 +23,7 @@ namespace raft { namespace detail { template -class owning_buffer { +struct owning_buffer { owning_buffer() {} owning_buffer(execution_device_id device_id, std::size_t size, execution_stream stream) {} auto* get() const { return static_cast(nullptr); } diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp index 04a6a5033c..dad4cb2da2 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp @@ -23,7 +23,7 @@ namespace raft { namespace detail { template -class owning_buffer { +struct owning_buffer { // TODO(wphicks): Assess need for buffers of const T using value_type = std::remove_const_t; diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp index c152fcff77..662d4caeae 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp @@ -24,7 +24,7 @@ namespace raft { namespace detail { template -class owning_buffer { +struct owning_buffer { using value_type = 
std::remove_const_t; owning_buffer() : data_{} {} diff --git a/cpp/include/raft/core/detail/device_setter_base.hpp b/cpp/include/raft/core/detail/device_setter_base.hpp index e6cee3f5e4..b3b84f3613 100644 --- a/cpp/include/raft/core/detail/device_setter_base.hpp +++ b/cpp/include/raft/core/detail/device_setter_base.hpp @@ -22,7 +22,7 @@ namespace detail { /** Struct for setting current device within a code block */ template -class device_setter { +struct device_setter { device_setter(execution_device_id device) {} }; diff --git a/cpp/include/raft/core/detail/device_setter_gpu.hpp b/cpp/include/raft/core/detail/device_setter_gpu.hpp index babb7c89b3..98cb682de6 100644 --- a/cpp/include/raft/core/detail/device_setter_gpu.hpp +++ b/cpp/include/raft/core/detail/device_setter_gpu.hpp @@ -26,7 +26,7 @@ namespace detail { /** Class for setting current device within a code block */ template <> -class device_setter { +struct device_setter { device_setter(raft::execution_device_id device) noexcept(false) : prev_device_{[]() { auto result = int{}; diff --git a/cpp/include/raft/core/device_mdbuffer.hpp b/cpp/include/raft/core/device_mdbuffer.hpp deleted file mode 100644 index f72ae36d64..0000000000 --- a/cpp/include/raft/core/device_mdbuffer.hpp +++ /dev/null @@ -1,316 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include - -namespace raft { - -template -using device_accessor = host_device_accessor; - -template -using managed_accessor = host_device_accessor; - -/** - * @brief std::experimental::mdspan with device tag to avoid accessing incorrect memory location. - */ -template > -using device_mdspan = mdspan>; - -template > -using managed_mdspan = mdspan>; - -template -struct is_device_mdspan : std::false_type { -}; -template -struct is_device_mdspan : std::bool_constant { -}; - -/** - * @\brief Boolean to determine if template type T is either raft::device_mdspan or a derived type - */ -template -using is_device_mdspan_t = is_device_mdspan>; - -template -using is_input_device_mdspan_t = is_device_mdspan>; - -template -using is_output_device_mdspan_t = is_device_mdspan>; - -template -struct is_managed_mdspan : std::false_type { -}; -template -struct is_managed_mdspan : std::bool_constant { -}; - -/** - * @\brief Boolean to determine if template type T is either raft::managed_mdspan or a derived type - */ -template -using is_managed_mdspan_t = is_managed_mdspan>; - -template -using is_input_managed_mdspan_t = is_managed_mdspan>; - -template -using is_output_managed_mdspan_t = is_managed_mdspan>; - -/** - * @\brief Boolean to determine if variadic template types Tn are either raft::device_mdspan or a - * derived type - */ -template -inline constexpr bool is_device_mdspan_v = std::conjunction_v...>; - -template -inline constexpr bool is_input_device_mdspan_v = - std::conjunction_v...>; - -template -inline constexpr bool is_output_device_mdspan_v = - std::conjunction_v...>; - -template -using enable_if_device_mdspan = std::enable_if_t>; - -template -using enable_if_input_device_mdspan = std::enable_if_t>; - -template -using enable_if_output_device_mdspan = std::enable_if_t>; - -/** - * @\brief Boolean to determine if variadic template types Tn are either raft::managed_mdspan or a - * derived type - */ -template -inline 
constexpr bool is_managed_mdspan_v = std::conjunction_v...>; - -template -inline constexpr bool is_input_managed_mdspan_v = - std::conjunction_v...>; - -template -inline constexpr bool is_output_managed_mdspan_v = - std::conjunction_v...>; - -template -using enable_if_managed_mdspan = std::enable_if_t>; - -template -using enable_if_input_managed_mdspan = std::enable_if_t>; - -template -using enable_if_output_managed_mdspan = std::enable_if_t>; - -/** - * @brief Shorthand for 0-dim host mdspan (scalar). - * @tparam ElementType the data type of the scalar element - * @tparam IndexType the index type of the extents - */ -template -using device_scalar_view = device_mdspan>; - -/** - * @brief Shorthand for 1-dim device mdspan. - * @tparam ElementType the data type of the vector elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - */ -template -using device_vector_view = device_mdspan, LayoutPolicy>; - -/** - * @brief Shorthand for c-contiguous device matrix view. - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - */ -template -using device_matrix_view = device_mdspan, LayoutPolicy>; - -/** - * @brief Shorthand for 128 byte aligned device matrix view. - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy must be of type layout_{left/right}_padded - */ -template , - typename = enable_if_layout_padded> -using device_aligned_matrix_view = - device_mdspan, - LayoutPolicy, - std::experimental::aligned_accessor>; - -/** - * @brief Create a 2-dim 128 byte aligned mdspan instance for device pointer. It's - * expected that the given layout policy match the layout of the underlying - * pointer. 
- * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy must be of type layout_{left/right}_padded - * @tparam IndexType the index type of the extents - * @param[in] ptr on device to wrap - * @param[in] n_rows number of rows in pointer - * @param[in] n_cols number of columns in pointer - */ -template > -auto make_device_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) -{ - using data_handle_type = - typename std::experimental::aligned_accessor::data_handle_type; - static_assert(std::is_same>::value || - std::is_same>::value); - assert(reinterpret_cast(ptr) == - std::experimental::details::alignTo(reinterpret_cast(ptr), - detail::alignment::value)); - - data_handle_type aligned_pointer = ptr; - - matrix_extent extents{n_rows, n_cols}; - return device_aligned_matrix_view{aligned_pointer, extents}; -} - -/** - * @brief Create a raft::managed_mdspan - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @param ptr Pointer to the data - * @param exts dimensionality of the array (series of integers) - * @return raft::managed_mdspan - */ -template -auto make_managed_mdspan(ElementType* ptr, extents exts) -{ - return make_mdspan(ptr, exts); -} - -/** - * @brief Create a 0-dim (scalar) mdspan instance for device value. - * - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @param[in] ptr on device to wrap - */ -template -auto make_device_scalar_view(ElementType* ptr) -{ - scalar_extent extents; - return device_scalar_view{ptr, extents}; -} - -/** - * @brief Create a 2-dim c-contiguous mdspan instance for device pointer. It's - * expected that the given layout policy match the layout of the underlying - * pointer. 
- * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * @tparam IndexType the index type of the extents - * @param[in] ptr on device to wrap - * @param[in] n_rows number of rows in pointer - * @param[in] n_cols number of columns in pointer - */ -template -auto make_device_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) -{ - matrix_extent extents{n_rows, n_cols}; - return device_matrix_view{ptr, extents}; -} - -/** - * @brief Create a 1-dim mdspan instance for device pointer. - * @tparam ElementType the data type of the vector elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] ptr on device to wrap - * @param[in] n number of elements in pointer - * @return raft::device_vector_view - */ -template -auto make_device_vector_view(ElementType* ptr, IndexType n) -{ - return device_vector_view{ptr, n}; -} - -/** - * @brief Create a 1-dim mdspan instance for device pointer. 
- * @tparam ElementType the data type of the vector elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] ptr on device to wrap - * @param[in] mapping The layout mapping to use for this vector - * @return raft::device_vector_view - */ -template -auto make_device_vector_view( - ElementType* ptr, - const typename LayoutPolicy::template mapping>& mapping) -{ - return device_vector_view{ptr, mapping}; -} - -/** - * @brief Construct a strided vector layout mapping - * - * Usage example: - * @code{.cpp} - * #include - * - * int n_elements = 10; - * int stride = 10; - * auto vector = raft::make_device_vector_view(vector_ptr, - * raft::make_vector_strided_layout(n_elements, stride)); - * @endcode - * - * @tparam IndexType the index type of the extents - * @param[in] n the number of elements in the vector - * @param[in] stride the stride between elements in the vector - */ -template -auto make_vector_strided_layout(IndexType n, IndexType stride) -{ - return make_strided_layout(vector_extent{n}, std::array{stride}); -} -} // end namespace raft diff --git a/cpp/test/core/buffer.cu b/cpp/test/core/buffer.cu index e69de29bb2..5881dfffc8 100644 --- a/cpp/test/core/buffer.cu +++ b/cpp/test/core/buffer.cu @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + +__global__ void check_buffer_access(int* buf) { + if (buf[0] == 1) { + buf[0] = 4; + } + if (buf[1] == 2) { + buf[1] = 5; + } + if (buf[2] == 3) { + buf[2] = 6; + } +} + +TEST(Buffer, device_buffer_access) +{ + auto data = std::vector{1, 2, 3}; + auto expected = std::vector{4, 5, 6}; + auto buf = buffer( + buffer(data.data(), data.size(), memory_type::host), + memory_type::device, + 0, + execution_stream{} + ); + // check_buffer_access<<<1,1>>>(buf.data()); + // auto data_out = std::vector(expected.size()); + // auto host_buf = buffer(data_out.data(), data_out.size(), memory_type::host); + // copy(host_buf, buf); + // ASSERT_EQ(cudaStreamSynchronize(execution_stream{}), cudaSuccess); + // EXPECT_THAT(data_out, testing::ElementsAreArray(expected)); +} + +} \ No newline at end of file From 20042b02902351368d5029ed63d90efdb8084abd Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 12 Apr 2023 10:29:45 -0700 Subject: [PATCH 12/75] Debugging --- cpp/include/raft/core/buffer.hpp | 61 +-- .../core/detail/buffer_utils/buffer_copy.hpp | 3 + .../core/detail/buffer_utils/copy_gpu.hpp | 16 +- .../detail/buffer_utils/non_owning_buffer.hpp | 6 +- cpp/include/raft/core/device_support.hpp | 6 +- cpp/test/CMakeLists.txt | 2 +- cpp/test/core/buffer.cpp | 360 ++++++++++++++++++ 7 files changed, 424 insertions(+), 30 deletions(-) create mode 100644 cpp/test/core/buffer.cpp diff --git a/cpp/include/raft/core/buffer.hpp b/cpp/include/raft/core/buffer.hpp index 0f73b6d41c..3cbfc48142 100644 --- a/cpp/include/raft/core/buffer.hpp +++ b/cpp/include/raft/core/buffer.hpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ #pragma once +#include "raft/core/logger.hpp" #include #include #include @@ -77,8 +78,6 @@ struct buffer { cached_ptr{[this]() { auto result = static_cast(nullptr); switch (data_.index()) { - case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; case 2: result = std::get<2>(data_).get(); break; case 3: result = std::get<3>(data_).get(); break; } @@ -111,15 +110,16 @@ struct buffer { memory_type_{mem_type}, cached_ptr{[this]() { auto result = static_cast(nullptr); + RAFT_LOG_INFO("DATA_INDEX %d\n", data_.index()); switch (data_.index()) { case 0: result = std::get<0>(data_).get(); break; case 1: result = std::get<1>(data_).get(); break; - case 2: result = std::get<2>(data_).get(); break; - case 3: result = std::get<3>(data_).get(); break; } + RAFT_LOG_INFO("result %p\n", result); return result; }()} { + RAFT_LOG_INFO("Non owning constructor called"); } /** @@ -148,12 +148,14 @@ struct buffer { detail::owning_buffer(std::get<1>(device_), other.size(), stream); result_data = buf.get(); result = std::move(buf); - // detail::buffer_copy(result_data, other.data(), other.size(), device_type::gpu, other.device_type(), stream); + RAFT_LOG_INFO("gpu copy called"); + detail::buffer_copy(result_data, other.data(), other.size(), device_type::gpu, other.dev_type(), stream); } else { auto buf = detail::owning_buffer(other.size()); result_data = buf.get(); result = std::move(buf); - detail::buffer_copy(result_data, other.data(), other.size(), device_type::cpu, other.device_type(), stream); + RAFT_LOG_INFO("copy called"); + detail::buffer_copy(result_data, other.data(), other.size(), device_type::cpu, other.dev_type(), stream); } return result; }()}, @@ -162,14 +164,13 @@ struct buffer { cached_ptr{[this]() { auto result = static_cast(nullptr); switch (data_.index()) { - case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; case 2: result = std::get<2>(data_).get(); break; case 3: 
result = std::get<3>(data_).get(); break; } return result; }()} { + RAFT_LOG_INFO("Pointer to other's data %p\n", other.data()); } /** @@ -177,7 +178,7 @@ struct buffer { * The memory type of this new buffer will be the same as the original */ buffer(buffer const& other) : buffer(other, - other.memory_type(), + other.mem_type(), other.device_index()) { } @@ -187,11 +188,18 @@ struct buffer { swap(first.device_, second.device_); swap(first.data_, second.data_); swap(first.size_, second.size_); + swap(first.memory_type_, second.memory_type_); swap(first.cached_ptr, second.cached_ptr); } buffer& operator=(buffer other) { - swap(*this, other); + // swap(*this, other); + RAFT_LOG_INFO("EQ Called"); + this -> device_ = other.device_; + // this -> data_ = other.data_; + this -> size_ = other.size_; + this -> memory_type_ = other.memory_type_; + this -> cached_ptr = other.cached_ptr; return *this; } @@ -199,9 +207,9 @@ struct buffer { * @brief Create owning copy of existing buffer with given stream * The device type of this new buffer will be the same as the original */ - buffer(buffer const& other, execution_stream stream) : buffer(other, other.memory_type(), other.device_index(), stream) - { - } + // buffer(buffer const& other, execution_stream stream) : buffer(other, other.mem_type(), other.device_index(), stream) + // { + // } /** * @brief Move from existing buffer unless a copy is necessary based on @@ -219,7 +227,7 @@ struct buffer { }()}, data_{[&other, mem_type, device, stream]() { auto result = data_store{}; - if (mem_type == other.memory_type() && device == other.device_index()) { + if (mem_type == other.mem_type() && device == other.device_index()) { result = std::move(other.data_); } else { auto* result_data = static_cast(nullptr); @@ -227,12 +235,12 @@ struct buffer { auto buf = detail::owning_buffer{device, other.size(), stream}; result_data = buf.get(); result = std::move(buf); - detail::buffer_copy(result_data, other.data(), other.size(), device_type::gpu, 
other.device_type(), stream); + detail::buffer_copy(result_data, other.data(), other.size(), device_type::gpu, other.dev_type(), stream); } else { auto buf = detail::owning_buffer{other.size()}; result_data = buf.get(); result = std::move(buf); - detail::buffer_copy(result_data, other.data(), other.size(), device_type::cpu, other.device_type(), stream); + detail::buffer_copy(result_data, other.data(), other.size(), device_type::cpu, other.dev_type(), stream); } } return result; @@ -242,8 +250,6 @@ struct buffer { cached_ptr{[this]() { auto result = static_cast(nullptr); switch (data_.index()) { - case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; case 2: result = std::get<2>(data_).get(); break; case 3: result = std::get<3>(data_).get(); break; } @@ -293,7 +299,16 @@ struct buffer { } auto size() const noexcept { return size_; } - HOST DEVICE auto* data() const noexcept { return cached_ptr; } + HOST DEVICE auto* data() const noexcept { + auto result = static_cast(nullptr); + switch (data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; + } + RAFT_LOG_INFO("data %p; cached_ptr %p\n", result, cached_ptr); + return result;} auto device() const noexcept { return device_; } @@ -307,7 +322,7 @@ struct buffer { return result; } - auto memory_type() const noexcept + auto mem_type() const noexcept { return memory_type_; } @@ -315,7 +330,7 @@ struct buffer { ~buffer() = default; private: - auto device_type() const noexcept + auto dev_type() const noexcept { enum device_type result; if (device_.index() == 0) { @@ -346,8 +361,8 @@ detail::const_agnostic_same_t copy(buffer& dst, throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); } } - auto src_device_type = is_device_accessible(src.memory_type()) ? 
device_type::gpu : device_type::cpu; - auto dst_device_type = is_device_accessible(dst.memory_type()) ? device_type::gpu : device_type::cpu; + auto src_device_type = is_device_accessible(src.mem_type()) ? device_type::gpu : device_type::cpu; + auto dst_device_type = is_device_accessible(dst.mem_type()) ? device_type::gpu : device_type::cpu; detail::buffer_copy(dst.data() + dst_offset, src.data() + src_offset, size, diff --git a/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp b/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp index 715d65d38d..d1df51272f 100644 --- a/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp @@ -14,6 +14,7 @@ * limitations under the License. */ #pragma once +#include "raft/util/cudart_utils.hpp" #include #include #include @@ -70,8 +71,10 @@ void buffer_copy(T* dst, copy( dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::gpu && src_type == device_type::cpu) { + raft::print_device_vector("dst_1", dst + dst_offset, size, std::cout); copy( dst + dst_offset, src + src_offset, size, stream); + raft::print_device_vector("dst_2", dst + dst_offset, size, std::cout); } else if (dst_type == device_type::cpu && src_type == device_type::gpu) { copy( dst + dst_offset, src + src_offset, size, stream); diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp index 3b5afe7ce0..0f5fbfc97f 100644 --- a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp @@ -14,14 +14,20 @@ * limitations under the License. 
*/ #pragma once +#include "thrust/detail/raw_pointer_cast.h" +#include "thrust/detail/tuple.inl" +#include "thrust/iterator/zip_iterator.h" +#include +#include #include +#include #include #include #include #include #include #include - +#include #include #include @@ -36,7 +42,13 @@ std::enable_if_t< void> copy(T* dst, T const* src, uint32_t size, raft::execution_stream stream) { - thrust::copy(rmm::exec_policy(stream), src, src + size, dst); + + cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDefault, stream); + // auto it = std::iterator(std::remove_const(src)); + // auto dst_ptr = thrust::device_pointer_cast(dst); + // auto it = thrust::make_zip_iterator(thrust::make_tuple(src)); + // auto v = std::vector {1,2,3}; + // thrust::copy(rmm::exec_policy(stream), v.begin(), v.end(), dst); } } // namespace detail diff --git a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp index f1e2361ee3..7c64eb33b6 100644 --- a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ #pragma once +#include "raft/core/logger.hpp" #include #include @@ -24,7 +25,10 @@ struct non_owning_buffer { using value_type = std::remove_const_t; non_owning_buffer() : data_{nullptr} {} - non_owning_buffer(T* ptr) : data_{ptr} {} + non_owning_buffer(T* ptr) : data_{ptr} { + RAFT_LOG_INFO("Address: %p\n", ( void * )data_); + } + auto* get() const { return data_; } diff --git a/cpp/include/raft/core/device_support.hpp b/cpp/include/raft/core/device_support.hpp index ba39c1b29c..8222b3b2b2 100644 --- a/cpp/include/raft/core/device_support.hpp +++ b/cpp/include/raft/core/device_support.hpp @@ -18,10 +18,10 @@ #include namespace raft { -#ifdef RAFT_DISABLE_CUDA -auto constexpr static const CUDA_ENABLED = false; +#ifndef RAFT_DISABLE_CUDA +auto constexpr static const CUDA_ENABLED = true; #else -auto constexpr static const CUDA_ENABLED = true; +auto constexpr static const CUDA_ENABLED = false; #endif #ifdef __CUDACC__ diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 4ce4b96c41..c7c745d2cc 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -93,7 +93,7 @@ if(BUILD_TESTS) NAME CORE_TEST PATH - test/core/buffer.cu + test/core/buffer.cpp test/core/logger.cpp test/core/math_device.cu test/core/math_host.cpp diff --git a/cpp/test/core/buffer.cpp b/cpp/test/core/buffer.cpp new file mode 100644 index 0000000000..a3e89375df --- /dev/null +++ b/cpp/test/core/buffer.cpp @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +namespace raft { + +// TEST(Buffer, default_buffer) +// { +// auto buf = buffer(); +// EXPECT_EQ(buf.mem_type(), memory_type::host); +// EXPECT_EQ(buf.size(), 0); +// EXPECT_EQ(buf.device_index(), 0); +// } + +// TEST(Buffer, device_buffer) +// { +// auto data = std::vector{1, 2, 3}; +// auto test_buffers = std::vector>{}; +// test_buffers.emplace_back(data.size(), memory_type::device, 0, execution_stream{}); +// test_buffers.emplace_back(data.size(), memory_type::device, 0); +// test_buffers.emplace_back(data.size(), memory_type::device); + +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::device); +// ASSERT_EQ(buf.size(), data.size()); +// #ifndef RAFT_DISABLE_CUDA +// ASSERT_NE(buf.data(), nullptr); + +// auto data_out = std::vector(data.size()); +// cudaMemcpy(static_cast(buf.data()), +// static_cast(data.data()), +// sizeof(int) * data.size(), +// cudaMemcpyHostToDevice); +// cudaMemcpy(static_cast(data_out.data()), +// static_cast(buf.data()), +// sizeof(int) * data.size(), +// cudaMemcpyDeviceToHost); +// EXPECT_THAT(data_out, testing::ElementsAreArray(data)); +// #endif +// } +// } + +// TEST(Buffer, non_owning_device_buffer) +// { +// auto data = std::vector{1, 2, 3}; +// auto* ptr_d = static_cast(nullptr); +// #ifndef RAFT_DISABLE_CUDA +// cudaMalloc(reinterpret_cast(&ptr_d), sizeof(int) * data.size()); +// cudaMemcpy(static_cast(ptr_d), +// static_cast(data.data()), +// sizeof(int) * data.size(), +// cudaMemcpyHostToDevice); +// #endif +// auto test_buffers = std::vector>{}; +// test_buffers.emplace_back(ptr_d, data.size(), memory_type::device, 0); +// test_buffers.emplace_back(ptr_d, data.size(), memory_type::device); +// #ifndef RAFT_DISABLE_CUDA + +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::device); +// 
ASSERT_EQ(buf.size(), data.size()); +// ASSERT_EQ(buf.data(), ptr_d); + +// auto data_out = std::vector(data.size()); +// cudaMemcpy(static_cast(data_out.data()), +// static_cast(buf.data()), +// sizeof(int) * data.size(), +// cudaMemcpyDeviceToHost); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// } +// cudaFree(reinterpret_cast(ptr_d)); +// #endif +// } + +// TEST(Buffer, host_buffer) +// { +// auto data = std::vector{1, 2, 3}; +// auto test_buffers = std::vector>{}; +// test_buffers.emplace_back(data.size(), memory_type::host, 0, execution_stream{}); +// test_buffers.emplace_back(data.size(), memory_type::host, 0); +// test_buffers.emplace_back(data.size(), memory_type::host); +// test_buffers.emplace_back(data.size()); + +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::host); +// ASSERT_EQ(buf.size(), data.size()); +// ASSERT_NE(buf.data(), nullptr); + +// std::memcpy( +// static_cast(buf.data()), static_cast(data.data()), data.size() * sizeof(int)); + +// auto data_out = std::vector(buf.data(), buf.data() + buf.size()); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// } +// } + +// TEST(Buffer, host_buffer_from_iters) +// { +// auto data = std::vector{1, 2, 3}; +// auto test_buffers = std::vector>{}; +// test_buffers.emplace_back(std::begin(data), std::end(data)); + +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::host); +// ASSERT_EQ(buf.size(), data.size()); +// ASSERT_NE(buf.data(), nullptr); + +// std::memcpy( +// static_cast(buf.data()), static_cast(data.data()), data.size() * sizeof(int)); + +// auto data_out = std::vector(buf.data(), buf.data() + buf.size()); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// } +// } + +// TEST(Buffer, device_buffer_from_iters) +// { +// auto data = std::vector{1, 2, 3}; +// auto test_buffers = std::vector>{}; +// test_buffers.emplace_back(std::begin(data), std::end(data), memory_type::device); +// 
test_buffers.emplace_back(std::begin(data), std::end(data), memory_type::device, 0); +// test_buffers.emplace_back(std::begin(data), std::end(data), memory_type::device, 0, execution_stream{}); + +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::device); +// ASSERT_EQ(buf.size(), data.size()); +// #ifndef RAFT_DISABLE_CUDA +// ASSERT_NE(buf.data(), nullptr); + +// auto data_out = std::vector(data.size()); +// cudaMemcpy(static_cast(buf.data()), +// static_cast(data.data()), +// sizeof(int) * data.size(), +// cudaMemcpyHostToDevice); +// cudaMemcpy(static_cast(data_out.data()), +// static_cast(buf.data()), +// sizeof(int) * data.size(), +// cudaMemcpyDeviceToHost); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// #endif +// } +// } + +TEST(Buffer, non_owning_host_buffer) +{ + auto data = std::vector{1, 2, 3}; + std::vector> test_buffers; + test_buffers.emplace_back(data.data(), data.size(), memory_type::host, 0); + // ASSERT_EQ(test_buffers.back().mem_type(), memory_type::host); + // ASSERT_EQ(test_buffers.back().size(), data.size()); + // ASSERT_EQ(test_buffers.back().data(), data.data()); + test_buffers.emplace_back(data.data(), data.size(), memory_type::host); + // ASSERT_EQ(test_buffers.back().mem_type(), memory_type::host); + // ASSERT_EQ(test_buffers.back().size(), data.size()); + // ASSERT_EQ(test_buffers.back().data(), data.data()); + test_buffers.emplace_back(data.data(), data.size()); + // ASSERT_EQ(test_buffers.back().mem_type(), memory_type::host); + // ASSERT_EQ(test_buffers.back().size(), data.size()); + // ASSERT_EQ(test_buffers.back().data(), data.data()); + + // for (auto& buf : test_buffers) + for (int i = 0; i < 3; i++) { + RAFT_LOG_INFO("memory_type %d\n", test_buffers[i].mem_type()); + ASSERT_EQ(test_buffers[i].mem_type(), memory_type::host); + ASSERT_EQ(test_buffers[i].size(), data.size()); + ASSERT_EQ(test_buffers[i].data(), data.data()); + + // auto data_out = std::vector(buf.data(), buf.data() + 
buf.size()); + // EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + } +} + +// TEST(Buffer, copy_buffer) +// { +// auto data = std::vector{1, 2, 3}; +// auto orig_buffer = buffer(data.data(), data.size(), memory_type::host); + +// auto test_buffers = std::vector>{}; +// test_buffers.emplace_back(orig_buffer); +// test_buffers.emplace_back(orig_buffer, memory_type::host); +// test_buffers.emplace_back(orig_buffer, memory_type::host, 0); +// test_buffers.emplace_back(orig_buffer, memory_type::host, 0, execution_stream{}); + +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::host); +// ASSERT_EQ(buf.size(), data.size()); +// ASSERT_NE(buf.data(), orig_buffer.data()); + +// auto data_out = std::vector(buf.data(), buf.data() + buf.size()); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + +// #ifndef RAFT_DISABLE_CUDA +// auto test_dev_buffers = std::vector>{}; +// test_dev_buffers.emplace_back(orig_buffer, memory_type::device); +// test_dev_buffers.emplace_back(orig_buffer, memory_type::device, 0); +// test_dev_buffers.emplace_back(orig_buffer, memory_type::device, 0, execution_stream{}); +// for (auto& dev_buf : test_dev_buffers) { +// data_out = std::vector(data.size()); +// RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(dev_buf.data()), dev_buf.size() * sizeof(int), cudaMemcpyDefault)); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + +// auto test_dev_copies = std::vector>{}; +// test_dev_copies.emplace_back(dev_buf, memory_type::device); +// test_dev_copies.emplace_back(dev_buf, memory_type::device, 0); +// test_dev_copies.emplace_back(dev_buf, memory_type::device, 0, execution_stream{}); +// for (auto& copy_buf : test_dev_copies) { +// data_out = std::vector(data.size()); +// RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(copy_buf.data()), copy_buf.size() * sizeof(int), cudaMemcpyDefault)); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// } 
+ +// auto test_host_buffers = std::vector>{}; +// test_host_buffers.emplace_back(dev_buf, memory_type::host); +// test_host_buffers.emplace_back(dev_buf, memory_type::host, 0); +// test_host_buffers.emplace_back(dev_buf, memory_type::host, 0, execution_stream{}); +// for (auto& host_buf : test_host_buffers) { +// data_out = std::vector(host_buf.data(), host_buf.data() + host_buf.size()); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// } +// } +// #endif +// } +// } + +// TEST(Buffer, move_buffer) +// { +// auto data = std::vector{1, 2, 3}; +// auto test_buffers = std::vector>{}; +// test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host)); +// test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::host); +// test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::host, 0); +// test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::host, 0, execution_stream{}); + +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::host); +// ASSERT_EQ(buf.size(), data.size()); +// ASSERT_EQ(buf.data(), data.data()); + +// auto data_out = std::vector(buf.data(), buf.data() + buf.size()); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// } +// #ifndef RAFT_DISABLE_CUDA +// test_buffers = std::vector>{}; +// test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::device); +// test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::device, 0); +// test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::device, 0, execution_stream{}); +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::device); +// ASSERT_EQ(buf.size(), data.size()); +// ASSERT_NE(buf.data(), data.data()); + +// auto data_out = std::vector(buf.size()); +// 
RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(buf.data()), buf.size() * sizeof(int), cudaMemcpyDefault)); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// } +// #endif +// } + +// TEST(Buffer, move_assignment_buffer) +// { +// auto data = std::vector{1, 2, 3}; + +// #ifndef RAFT_DISABLE_CUDA +// auto buf = buffer{data.data(), data.size() - 1, memory_type::device}; +// #else +// auto buf = buffer{data.data(), data.size() - 1, memory_type::host}; +// #endif +// buf = buffer{data.size(), memory_type::host}; + +// ASSERT_EQ(buf.mem_type(), memory_type::host); +// ASSERT_EQ(buf.size(), data.size()); +// } + +// TEST(Buffer, partial_buffer_copy) +// { +// auto data1 = std::vector{1, 2, 3, 4, 5}; +// auto data2 = std::vector{0, 0, 0, 0, 0}; +// auto expected = std::vector{0, 3, 4, 5, 0}; +// #ifndef RAFT_DISABLE_CUDA +// auto buf1 = buffer{buffer{data1.data(), data1.size(), memory_type::host}, memory_type::device}; +// #else +// auto buf1 = buffer{data1.data(), data1.size(), memory_type::host}; +// #endif +// auto buf2 = buffer{data2.data(), data2.size(), memory_type::host}; +// copy(buf2, buf1, 1, 2, 3, execution_stream{}); +// copy(buf2, buf1, 1, 2, 3, execution_stream{}); +// EXPECT_THROW(copy(buf2, buf1, 1, 2, 4, execution_stream{}), out_of_bounds); +// } + +// TEST(Buffer, buffer_copy_overloads) +// { +// auto data = std::vector{1, 2, 3}; +// auto expected = data; +// auto orig_host_buffer = buffer(data.data(), data.size(), memory_type::host); +// auto orig_dev_buffer = buffer(orig_host_buffer, memory_type::device); +// auto copy_dev_buffer = buffer(data.size(), memory_type::device); + +// // copying host to host +// auto data_out = std::vector(data.size()); +// auto copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); +// copy(copy_host_buffer, orig_host_buffer); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + +// // copying host to host with stream +// data_out = 
std::vector(data.size()); +// copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); +// copy(copy_host_buffer, orig_host_buffer, execution_stream{}); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + +// // copying host to host with offset +// data_out = std::vector(data.size() + 1); +// copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); +// copy(copy_host_buffer, orig_host_buffer, 2, 1, 1, execution_stream{}); +// expected = std::vector{0, 0, 2, 0}; +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + +// #ifndef RAFT_DISABLE_CUDA +// // copy device to host +// data_out = std::vector(data.size()); +// copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); +// copy(copy_host_buffer, orig_dev_buffer); +// expected = data; +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + +// // copy device to host with stream +// data_out = std::vector(data.size()); +// copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); +// copy(copy_host_buffer, orig_dev_buffer, execution_stream{}); +// expected = data; +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + +// // copy device to host with offset +// data_out = std::vector(data.size() + 1); +// copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); +// copy(copy_host_buffer, orig_dev_buffer, 2, 1, 1, execution_stream{}); +// expected = std::vector{0, 0, 2, 0}; +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); +// #endif +// } + +} \ No newline at end of file From 2d189c3aa479fee6633cf860ec0ad1e873457bc8 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 19 Apr 2023 12:07:38 -0700 Subject: [PATCH 13/75] Update gtest --- cpp/CMakeLists.txt | 2 +- cpp/include/raft/core/buffer.hpp | 88 +-- .../core/detail/buffer_utils/buffer_copy.hpp | 2 +- .../detail/buffer_utils/non_owning_buffer.hpp | 1 - .../detail/buffer_utils/owning_buffer.hpp | 2 +- 
cpp/include/raft/core/device_setter.hpp | 2 +- cpp/include/raft/core/device_support.hpp | 2 +- cpp/include/raft/core/execution_device_id.hpp | 2 +- cpp/include/raft/core/execution_stream.hpp | 6 +- cpp/test/core/buffer.cpp | 586 ++++++++---------- 10 files changed, 303 insertions(+), 390 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3d9d7c9419..47140be366 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -252,7 +252,7 @@ endif() # - CUDA-free build support -------------------------------------------------- if (DISABLE_CUDA) - target_compile_definitions(raft INTERFACE RAFT_DISABLE_CUDA) + target_compile_definitions(raft INTERFACE RAFT_DISABLE_GPU) endif() # ################################################################################################## diff --git a/cpp/include/raft/core/buffer.hpp b/cpp/include/raft/core/buffer.hpp index 3cbfc48142..df009637c3 100644 --- a/cpp/include/raft/core/buffer.hpp +++ b/cpp/include/raft/core/buffer.hpp @@ -89,6 +89,7 @@ struct buffer { /** Construct non-owning buffer */ buffer(T* input_data, index_type size, memory_type mem_type = memory_type::host, int device = 0) : device_{[mem_type, &device]() { + RAFT_LOG_INFO("Non owning constructor call started"); auto result = execution_device_id_variant{}; if (is_device_accessible(mem_type)) { result = execution_device_id{device}; @@ -110,16 +111,16 @@ struct buffer { memory_type_{mem_type}, cached_ptr{[this]() { auto result = static_cast(nullptr); - RAFT_LOG_INFO("DATA_INDEX %d\n", data_.index()); + RAFT_LOG_INFO("data_index from constructor %d\n", data_.index()); switch (data_.index()) { case 0: result = std::get<0>(data_).get(); break; case 1: result = std::get<1>(data_).get(); break; } - RAFT_LOG_INFO("result %p\n", result); + RAFT_LOG_INFO("data pointer from constructor %p\n", result); return result; }()} { - RAFT_LOG_INFO("Non owning constructor called"); + RAFT_LOG_INFO("Non owning constructor call complete"); } /** @@ -154,7 +155,7 @@ 
struct buffer { auto buf = detail::owning_buffer(other.size()); result_data = buf.get(); result = std::move(buf); - RAFT_LOG_INFO("copy called"); + RAFT_LOG_INFO("cpu copy called"); detail::buffer_copy(result_data, other.data(), other.size(), device_type::cpu, other.dev_type(), stream); } return result; @@ -173,15 +174,6 @@ struct buffer { RAFT_LOG_INFO("Pointer to other's data %p\n", other.data()); } - /** - * @brief Create owning copy of existing buffer - * The memory type of this new buffer will be the same as the original - */ - buffer(buffer const& other) : buffer(other, - other.mem_type(), - other.device_index()) - { - } friend void swap(buffer& first, buffer& second) { using std::swap; @@ -191,15 +183,9 @@ struct buffer { swap(first.memory_type_, second.memory_type_); swap(first.cached_ptr, second.cached_ptr); } - buffer& operator=(buffer other) - { - // swap(*this, other); - RAFT_LOG_INFO("EQ Called"); - this -> device_ = other.device_; - // this -> data_ = other.data_; - this -> size_ = other.size_; - this -> memory_type_ = other.memory_type_; - this -> cached_ptr = other.cached_ptr; + buffer& operator=(buffer const& other) { + auto copy = other; + swap(*this, copy); return *this; } @@ -207,9 +193,9 @@ struct buffer { * @brief Create owning copy of existing buffer with given stream * The device type of this new buffer will be the same as the original */ - // buffer(buffer const& other, execution_stream stream) : buffer(other, other.mem_type(), other.device_index(), stream) - // { - // } + buffer(buffer const& other, execution_stream stream=execution_stream{}) : buffer(other, other.mem_type(), other.device_index(), stream) + { + } /** * @brief Move from existing buffer unless a copy is necessary based on @@ -256,46 +242,28 @@ struct buffer { return result; }()} { + RAFT_LOG_INFO("original move called"); } - buffer(buffer&& other, device_type mem_type, int device) + buffer(buffer&& other, device_type mem_type, int device=0) : buffer{std::move(other), 
mem_type, device, execution_stream{}} { + RAFT_LOG_INFO("move constructor without stream called"); } - buffer(buffer&& other, device_type mem_type) - : buffer{std::move(other), mem_type, 0, execution_stream{}} - { - } - - buffer(buffer&& other) : buffer{} { swap(*this, other); } - - template < - typename iter_t, - typename = decltype(*std::declval(), void(), ++std::declval(), void())> - buffer(iter_t const& begin, iter_t const& end) - : buffer{static_cast(std::distance(begin, end))} - { - auto index = std::size_t{}; - std::for_each(begin, end, [&index, this](auto&& val) { data()[index++] = val; }); - } - - template < - typename iter_t, - typename = decltype(*std::declval(), void(), ++std::declval(), void())> - buffer(iter_t const& begin, iter_t const& end, device_type mem_type) - : buffer{buffer{begin, end}, mem_type} - { - } + // buffer(buffer&& other, device_type mem_type) + // : buffer{std::move(other), mem_type, 0, execution_stream{}} + // { + // RAFT_LOG_INFO("copy constructor without stream and device called"); + // } - template < - typename iter_t, - typename = decltype(*std::declval(), void(), ++std::declval(), void())> - buffer(iter_t const& begin, - iter_t const& end, - device_type mem_type, - int device, - execution_stream stream = execution_stream{}) - : buffer{buffer{begin, end}, mem_type, device, stream} - { + buffer(buffer&& other) noexcept + : buffer{std::move(other), other.mem_type(), other.device_index(), execution_stream{}} {} + buffer& operator=(buffer&& other) noexcept { + data_ = std::move(other.data_); + device_ = std::move(other.device_); + size_ = std::move(other.size_); + memory_type_ = std::move(other.memory_type_); + cached_ptr = std::move(other.cached_ptr); + return *this; } auto size() const noexcept { return size_; } diff --git a/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp b/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp index d1df51272f..06c374b542 100644 --- 
a/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp @@ -18,7 +18,7 @@ #include #include #include -#ifndef RAFT_DISABLE_CUDA +#ifndef RAFT_DISABLE_GPU #include #endif #include diff --git a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp index 7c64eb33b6..4ddb294abe 100644 --- a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp @@ -26,7 +26,6 @@ struct non_owning_buffer { non_owning_buffer() : data_{nullptr} {} non_owning_buffer(T* ptr) : data_{ptr} { - RAFT_LOG_INFO("Address: %p\n", ( void * )data_); } diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp index c9f1aeca06..c8f8da128d 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp @@ -16,6 +16,6 @@ #pragma once #include "owning_buffer_cpu.hpp" #include -#ifndef RAFT_DISABLE_CUDA +#ifndef RAFT_DISABLE_GPU #include "owning_buffer_gpu.hpp" #endif \ No newline at end of file diff --git a/cpp/include/raft/core/device_setter.hpp b/cpp/include/raft/core/device_setter.hpp index badf7ae7fc..23c9c91767 100644 --- a/cpp/include/raft/core/device_setter.hpp +++ b/cpp/include/raft/core/device_setter.hpp @@ -15,7 +15,7 @@ */ #pragma once #include -#ifndef RAFT_DISABLE_CUDA +#ifndef RAFT_DISABLE_GPU #include #endif #include diff --git a/cpp/include/raft/core/device_support.hpp b/cpp/include/raft/core/device_support.hpp index 8222b3b2b2..c0fe74b33d 100644 --- a/cpp/include/raft/core/device_support.hpp +++ b/cpp/include/raft/core/device_support.hpp @@ -18,7 +18,7 @@ #include namespace raft { -#ifndef RAFT_DISABLE_CUDA +#ifndef RAFT_DISABLE_GPU auto constexpr static const CUDA_ENABLED = true; #else auto constexpr static const 
CUDA_ENABLED = false; diff --git a/cpp/include/raft/core/execution_device_id.hpp b/cpp/include/raft/core/execution_device_id.hpp index 5c7bae4575..3e98fcdbe4 100644 --- a/cpp/include/raft/core/execution_device_id.hpp +++ b/cpp/include/raft/core/execution_device_id.hpp @@ -17,7 +17,7 @@ #include #include -#ifndef RAFT_DISABLE_CUDA +#ifndef RAFT_DISABLE_GPU #include #endif #include diff --git a/cpp/include/raft/core/execution_stream.hpp b/cpp/include/raft/core/execution_stream.hpp index 945d6c55b7..cb1e069f4a 100644 --- a/cpp/include/raft/core/execution_stream.hpp +++ b/cpp/include/raft/core/execution_stream.hpp @@ -14,19 +14,19 @@ * limitations under the License. */ #pragma once -#ifndef RAFT_DISABLE_CUDA +#ifndef RAFT_DISABLE_GPU #include #endif namespace raft { -#ifndef RAFT_DISABLE_CUDA +#ifndef RAFT_DISABLE_GPU using execution_stream = cudaStream_t; #else using execution_stream = int; #endif inline void synchronize(execution_stream stream) { -#ifndef RAFT_DISABLE_CUDA +#ifndef RAFT_DISABLE_GPU cudaStreamSynchronize(stream); #endif } diff --git a/cpp/test/core/buffer.cpp b/cpp/test/core/buffer.cpp index a3e89375df..a6192ebc36 100644 --- a/cpp/test/core/buffer.cpp +++ b/cpp/test/core/buffer.cpp @@ -23,338 +23,284 @@ namespace raft { -// TEST(Buffer, default_buffer) -// { -// auto buf = buffer(); -// EXPECT_EQ(buf.mem_type(), memory_type::host); -// EXPECT_EQ(buf.size(), 0); -// EXPECT_EQ(buf.device_index(), 0); -// } - -// TEST(Buffer, device_buffer) -// { -// auto data = std::vector{1, 2, 3}; -// auto test_buffers = std::vector>{}; -// test_buffers.emplace_back(data.size(), memory_type::device, 0, execution_stream{}); -// test_buffers.emplace_back(data.size(), memory_type::device, 0); -// test_buffers.emplace_back(data.size(), memory_type::device); - -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::device); -// ASSERT_EQ(buf.size(), data.size()); -// #ifndef RAFT_DISABLE_CUDA -// ASSERT_NE(buf.data(), nullptr); - -// auto data_out = 
std::vector(data.size()); -// cudaMemcpy(static_cast(buf.data()), -// static_cast(data.data()), -// sizeof(int) * data.size(), -// cudaMemcpyHostToDevice); -// cudaMemcpy(static_cast(data_out.data()), -// static_cast(buf.data()), -// sizeof(int) * data.size(), -// cudaMemcpyDeviceToHost); -// EXPECT_THAT(data_out, testing::ElementsAreArray(data)); -// #endif -// } -// } - -// TEST(Buffer, non_owning_device_buffer) -// { -// auto data = std::vector{1, 2, 3}; -// auto* ptr_d = static_cast(nullptr); -// #ifndef RAFT_DISABLE_CUDA -// cudaMalloc(reinterpret_cast(&ptr_d), sizeof(int) * data.size()); -// cudaMemcpy(static_cast(ptr_d), -// static_cast(data.data()), -// sizeof(int) * data.size(), -// cudaMemcpyHostToDevice); -// #endif -// auto test_buffers = std::vector>{}; -// test_buffers.emplace_back(ptr_d, data.size(), memory_type::device, 0); -// test_buffers.emplace_back(ptr_d, data.size(), memory_type::device); -// #ifndef RAFT_DISABLE_CUDA - -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::device); -// ASSERT_EQ(buf.size(), data.size()); -// ASSERT_EQ(buf.data(), ptr_d); - -// auto data_out = std::vector(data.size()); -// cudaMemcpy(static_cast(data_out.data()), -// static_cast(buf.data()), -// sizeof(int) * data.size(), -// cudaMemcpyDeviceToHost); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// } -// cudaFree(reinterpret_cast(ptr_d)); -// #endif -// } - -// TEST(Buffer, host_buffer) -// { -// auto data = std::vector{1, 2, 3}; -// auto test_buffers = std::vector>{}; -// test_buffers.emplace_back(data.size(), memory_type::host, 0, execution_stream{}); -// test_buffers.emplace_back(data.size(), memory_type::host, 0); -// test_buffers.emplace_back(data.size(), memory_type::host); -// test_buffers.emplace_back(data.size()); - -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::host); -// ASSERT_EQ(buf.size(), data.size()); -// ASSERT_NE(buf.data(), nullptr); - -// std::memcpy( -// 
static_cast(buf.data()), static_cast(data.data()), data.size() * sizeof(int)); - -// auto data_out = std::vector(buf.data(), buf.data() + buf.size()); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// } -// } - -// TEST(Buffer, host_buffer_from_iters) -// { -// auto data = std::vector{1, 2, 3}; -// auto test_buffers = std::vector>{}; -// test_buffers.emplace_back(std::begin(data), std::end(data)); - -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::host); -// ASSERT_EQ(buf.size(), data.size()); -// ASSERT_NE(buf.data(), nullptr); - -// std::memcpy( -// static_cast(buf.data()), static_cast(data.data()), data.size() * sizeof(int)); - -// auto data_out = std::vector(buf.data(), buf.data() + buf.size()); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// } -// } - -// TEST(Buffer, device_buffer_from_iters) -// { -// auto data = std::vector{1, 2, 3}; -// auto test_buffers = std::vector>{}; -// test_buffers.emplace_back(std::begin(data), std::end(data), memory_type::device); -// test_buffers.emplace_back(std::begin(data), std::end(data), memory_type::device, 0); -// test_buffers.emplace_back(std::begin(data), std::end(data), memory_type::device, 0, execution_stream{}); - -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::device); -// ASSERT_EQ(buf.size(), data.size()); -// #ifndef RAFT_DISABLE_CUDA -// ASSERT_NE(buf.data(), nullptr); - -// auto data_out = std::vector(data.size()); -// cudaMemcpy(static_cast(buf.data()), -// static_cast(data.data()), -// sizeof(int) * data.size(), -// cudaMemcpyHostToDevice); -// cudaMemcpy(static_cast(data_out.data()), -// static_cast(buf.data()), -// sizeof(int) * data.size(), -// cudaMemcpyDeviceToHost); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// #endif -// } -// } +TEST(Buffer, default_buffer) +{ + auto buf = buffer(); + EXPECT_EQ(buf.mem_type(), memory_type::host); + EXPECT_EQ(buf.size(), 0); + 
EXPECT_EQ(buf.device_index(), 0); +} + +TEST(Buffer, device_buffer) +{ + auto data = std::vector{1, 2, 3}; + auto test_buffers = std::vector>{}; + test_buffers.emplace_back(data.size(), memory_type::device, 0, execution_stream{}); + test_buffers.emplace_back(data.size(), memory_type::device, 0); + test_buffers.emplace_back(data.size(), memory_type::device); + + for (auto& buf : test_buffers) { + ASSERT_EQ(buf.mem_type(), memory_type::device); + ASSERT_EQ(buf.size(), data.size()); +#ifndef RAFT_DISABLE_GPU + ASSERT_NE(buf.data(), nullptr); + + auto data_out = std::vector(data.size()); + cudaMemcpy(static_cast(buf.data()), + static_cast(data.data()), + sizeof(int) * data.size(), + cudaMemcpyHostToDevice); + cudaMemcpy(static_cast(data_out.data()), + static_cast(buf.data()), + sizeof(int) * data.size(), + cudaMemcpyDeviceToHost); + EXPECT_THAT(data_out, testing::ElementsAreArray(data)); +#endif + } +} + +TEST(Buffer, non_owning_device_buffer) +{ + auto data = std::vector{1, 2, 3}; + auto* ptr_d = static_cast(nullptr); +#ifndef RAFT_DISABLE_GPU + cudaMalloc(reinterpret_cast(&ptr_d), sizeof(int) * data.size()); + cudaMemcpy(static_cast(ptr_d), + static_cast(data.data()), + sizeof(int) * data.size(), + cudaMemcpyHostToDevice); +#endif + auto test_buffers = std::vector>{}; + test_buffers.emplace_back(ptr_d, data.size(), memory_type::device, 0); + test_buffers.emplace_back(ptr_d, data.size(), memory_type::device); +#ifndef RAFT_DISABLE_GPU + + for (auto& buf : test_buffers) { + ASSERT_EQ(buf.mem_type(), memory_type::device); + ASSERT_EQ(buf.size(), data.size()); + ASSERT_EQ(buf.data(), ptr_d); + + auto data_out = std::vector(data.size()); + cudaMemcpy(static_cast(data_out.data()), + static_cast(buf.data()), + sizeof(int) * data.size(), + cudaMemcpyDeviceToHost); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + } + cudaFree(reinterpret_cast(ptr_d)); +#endif +} + +TEST(Buffer, host_buffer) +{ + auto data = std::vector{1, 2, 3}; + auto test_buffers = 
std::vector>{}; + test_buffers.emplace_back(data.size(), memory_type::host, 0, execution_stream{}); + test_buffers.emplace_back(data.size(), memory_type::host, 0); + test_buffers.emplace_back(data.size(), memory_type::host); + test_buffers.emplace_back(data.size()); + + for (auto& buf : test_buffers) { + ASSERT_EQ(buf.mem_type(), memory_type::host); + ASSERT_EQ(buf.size(), data.size()); + ASSERT_NE(buf.data(), nullptr); + + std::memcpy( + static_cast(buf.data()), static_cast(data.data()), data.size() * sizeof(int)); + + auto data_out = std::vector(buf.data(), buf.data() + buf.size()); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + } +} TEST(Buffer, non_owning_host_buffer) { auto data = std::vector{1, 2, 3}; std::vector> test_buffers; test_buffers.emplace_back(data.data(), data.size(), memory_type::host, 0); - // ASSERT_EQ(test_buffers.back().mem_type(), memory_type::host); - // ASSERT_EQ(test_buffers.back().size(), data.size()); - // ASSERT_EQ(test_buffers.back().data(), data.data()); test_buffers.emplace_back(data.data(), data.size(), memory_type::host); - // ASSERT_EQ(test_buffers.back().mem_type(), memory_type::host); - // ASSERT_EQ(test_buffers.back().size(), data.size()); - // ASSERT_EQ(test_buffers.back().data(), data.data()); test_buffers.emplace_back(data.data(), data.size()); - // ASSERT_EQ(test_buffers.back().mem_type(), memory_type::host); - // ASSERT_EQ(test_buffers.back().size(), data.size()); - // ASSERT_EQ(test_buffers.back().data(), data.data()); - - // for (auto& buf : test_buffers) - for (int i = 0; i < 3; i++) { - RAFT_LOG_INFO("memory_type %d\n", test_buffers[i].mem_type()); - ASSERT_EQ(test_buffers[i].mem_type(), memory_type::host); - ASSERT_EQ(test_buffers[i].size(), data.size()); - ASSERT_EQ(test_buffers[i].data(), data.data()); - - // auto data_out = std::vector(buf.data(), buf.data() + buf.size()); - // EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + + for (auto& buf : test_buffers) { + ASSERT_EQ(buf.mem_type(), 
memory_type::host); + ASSERT_EQ(buf.size(), data.size()); + ASSERT_EQ(buf.data(), data.data()); + + auto data_out = std::vector(buf.data(), buf.data() + buf.size()); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); } } -// TEST(Buffer, copy_buffer) -// { -// auto data = std::vector{1, 2, 3}; -// auto orig_buffer = buffer(data.data(), data.size(), memory_type::host); - -// auto test_buffers = std::vector>{}; -// test_buffers.emplace_back(orig_buffer); -// test_buffers.emplace_back(orig_buffer, memory_type::host); -// test_buffers.emplace_back(orig_buffer, memory_type::host, 0); -// test_buffers.emplace_back(orig_buffer, memory_type::host, 0, execution_stream{}); - -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::host); -// ASSERT_EQ(buf.size(), data.size()); -// ASSERT_NE(buf.data(), orig_buffer.data()); - -// auto data_out = std::vector(buf.data(), buf.data() + buf.size()); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - -// #ifndef RAFT_DISABLE_CUDA -// auto test_dev_buffers = std::vector>{}; -// test_dev_buffers.emplace_back(orig_buffer, memory_type::device); -// test_dev_buffers.emplace_back(orig_buffer, memory_type::device, 0); -// test_dev_buffers.emplace_back(orig_buffer, memory_type::device, 0, execution_stream{}); -// for (auto& dev_buf : test_dev_buffers) { -// data_out = std::vector(data.size()); -// RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(dev_buf.data()), dev_buf.size() * sizeof(int), cudaMemcpyDefault)); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - -// auto test_dev_copies = std::vector>{}; -// test_dev_copies.emplace_back(dev_buf, memory_type::device); -// test_dev_copies.emplace_back(dev_buf, memory_type::device, 0); -// test_dev_copies.emplace_back(dev_buf, memory_type::device, 0, execution_stream{}); -// for (auto& copy_buf : test_dev_copies) { -// data_out = std::vector(data.size()); -// RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), 
static_cast(copy_buf.data()), copy_buf.size() * sizeof(int), cudaMemcpyDefault)); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// } - -// auto test_host_buffers = std::vector>{}; -// test_host_buffers.emplace_back(dev_buf, memory_type::host); -// test_host_buffers.emplace_back(dev_buf, memory_type::host, 0); -// test_host_buffers.emplace_back(dev_buf, memory_type::host, 0, execution_stream{}); -// for (auto& host_buf : test_host_buffers) { -// data_out = std::vector(host_buf.data(), host_buf.data() + host_buf.size()); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// } -// } -// #endif -// } -// } - -// TEST(Buffer, move_buffer) -// { -// auto data = std::vector{1, 2, 3}; -// auto test_buffers = std::vector>{}; -// test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host)); -// test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::host); -// test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::host, 0); -// test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::host, 0, execution_stream{}); - -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::host); -// ASSERT_EQ(buf.size(), data.size()); -// ASSERT_EQ(buf.data(), data.data()); - -// auto data_out = std::vector(buf.data(), buf.data() + buf.size()); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// } -// #ifndef RAFT_DISABLE_CUDA -// test_buffers = std::vector>{}; -// test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::device); -// test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::device, 0); -// test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::device, 0, execution_stream{}); -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::device); -// 
ASSERT_EQ(buf.size(), data.size()); -// ASSERT_NE(buf.data(), data.data()); - -// auto data_out = std::vector(buf.size()); -// RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(buf.data()), buf.size() * sizeof(int), cudaMemcpyDefault)); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// } -// #endif -// } - -// TEST(Buffer, move_assignment_buffer) -// { -// auto data = std::vector{1, 2, 3}; - -// #ifndef RAFT_DISABLE_CUDA -// auto buf = buffer{data.data(), data.size() - 1, memory_type::device}; -// #else -// auto buf = buffer{data.data(), data.size() - 1, memory_type::host}; -// #endif -// buf = buffer{data.size(), memory_type::host}; - -// ASSERT_EQ(buf.mem_type(), memory_type::host); -// ASSERT_EQ(buf.size(), data.size()); -// } - -// TEST(Buffer, partial_buffer_copy) -// { -// auto data1 = std::vector{1, 2, 3, 4, 5}; -// auto data2 = std::vector{0, 0, 0, 0, 0}; -// auto expected = std::vector{0, 3, 4, 5, 0}; -// #ifndef RAFT_DISABLE_CUDA -// auto buf1 = buffer{buffer{data1.data(), data1.size(), memory_type::host}, memory_type::device}; -// #else -// auto buf1 = buffer{data1.data(), data1.size(), memory_type::host}; -// #endif -// auto buf2 = buffer{data2.data(), data2.size(), memory_type::host}; -// copy(buf2, buf1, 1, 2, 3, execution_stream{}); -// copy(buf2, buf1, 1, 2, 3, execution_stream{}); -// EXPECT_THROW(copy(buf2, buf1, 1, 2, 4, execution_stream{}), out_of_bounds); -// } - -// TEST(Buffer, buffer_copy_overloads) -// { -// auto data = std::vector{1, 2, 3}; -// auto expected = data; -// auto orig_host_buffer = buffer(data.data(), data.size(), memory_type::host); -// auto orig_dev_buffer = buffer(orig_host_buffer, memory_type::device); -// auto copy_dev_buffer = buffer(data.size(), memory_type::device); +TEST(Buffer, copy_constructor) +{ + auto data = std::vector{1, 2, 3}; + buffer const orig_buffer = buffer(data.data(), data.size(), memory_type::host); + + // host to host copy operations + auto test_buffers = 
std::vector>{}; + test_buffers.emplace_back(orig_buffer); + test_buffers.emplace_back(orig_buffer, memory_type::host); + test_buffers.emplace_back(orig_buffer, memory_type::host, 0); + test_buffers.emplace_back(orig_buffer, memory_type::host, 0, execution_stream{}); + + for (auto& buf : test_buffers) { + ASSERT_EQ(buf.mem_type(), memory_type::host); + ASSERT_EQ(buf.size(), data.size()); + ASSERT_NE(buf.data(), orig_buffer.data()); + + auto data_out = std::vector(buf.data(), buf.data() + buf.size()); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + +#ifndef RAFT_DISABLE_GPU + // host to device copy operations + auto test_dev_buffers = std::vector>{}; + test_dev_buffers.emplace_back(orig_buffer, memory_type::device); + test_dev_buffers.emplace_back(orig_buffer, memory_type::device, 0); + test_dev_buffers.emplace_back(orig_buffer, memory_type::device, 0, execution_stream{}); + for (auto& dev_buf : test_dev_buffers) { + data_out = std::vector(data.size()); + RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(dev_buf.data()), dev_buf.size() * sizeof(int), cudaMemcpyDefault)); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + + // device to device copy operations + auto test_dev_copies = std::vector>{}; + test_dev_copies.emplace_back(dev_buf, memory_type::device); + test_dev_copies.emplace_back(dev_buf, memory_type::device, 0); + test_dev_copies.emplace_back(dev_buf, memory_type::device, 0, execution_stream{}); + for (auto& copy_buf : test_dev_copies) { + data_out = std::vector(data.size()); + RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(copy_buf.data()), copy_buf.size() * sizeof(int), cudaMemcpyDefault)); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + } + + // device to host copy operations + auto test_host_buffers = std::vector>{}; + test_host_buffers.emplace_back(dev_buf, memory_type::host); + test_host_buffers.emplace_back(dev_buf, memory_type::host, 0); + 
test_host_buffers.emplace_back(dev_buf, memory_type::host, 0, execution_stream{}); + for (auto& host_buf : test_host_buffers) { + data_out = std::vector(host_buf.data(), host_buf.data() + host_buf.size()); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + } + } +#endif + } +} + +TEST(Buffer, move_buffer) +{ + auto data = std::vector{1, 2, 3}; + auto test_buffers = std::vector>{}; + test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host)); + test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::host); + test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::host, 0); + test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::host, 0, execution_stream{}); + + for (auto& buf : test_buffers) { + ASSERT_EQ(buf.mem_type(), memory_type::host); + ASSERT_EQ(buf.size(), data.size()); + ASSERT_EQ(buf.data(), data.data()); + + auto data_out = std::vector(buf.data(), buf.data() + buf.size()); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + } +#ifndef RAFT_DISABLE_GPU + test_buffers = std::vector>{}; + test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::device); + test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::device, 0); + test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::device, 0, execution_stream{}); + for (auto& buf : test_buffers) { + ASSERT_EQ(buf.mem_type(), memory_type::device); + ASSERT_EQ(buf.size(), data.size()); + ASSERT_NE(buf.data(), data.data()); + + auto data_out = std::vector(buf.size()); + RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(buf.data()), buf.size() * sizeof(int), cudaMemcpyDefault)); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + } +#endif +} + +TEST(Buffer, move_assignment_buffer) +{ + auto data = std::vector{1, 2, 3}; + +#ifndef 
RAFT_DISABLE_GPU + auto buf = buffer{data.data(), data.size() - 1, memory_type::device}; +#else + auto buf = buffer{data.data(), data.size() - 1, memory_type::host}; +#endif + buf = buffer{data.size(), memory_type::host}; + + ASSERT_EQ(buf.mem_type(), memory_type::host); + ASSERT_EQ(buf.size(), data.size()); +} + +TEST(Buffer, partial_buffer_copy) +{ + auto data1 = std::vector{1, 2, 3, 4, 5}; + auto data2 = std::vector{0, 0, 0, 0, 0}; + auto expected = std::vector{0, 3, 4, 5, 0}; +#ifndef RAFT_DISABLE_GPU + auto buf1 = buffer{buffer{data1.data(), data1.size(), memory_type::host}, memory_type::device}; +#else + auto buf1 = buffer{data1.data(), data1.size(), memory_type::host}; +#endif + auto buf2 = buffer{data2.data(), data2.size(), memory_type::host}; + copy(buf2, buf1, 1, 2, 3, execution_stream{}); + copy(buf2, buf1, 1, 2, 3, execution_stream{}); + EXPECT_THROW(copy(buf2, buf1, 1, 2, 4, execution_stream{}), out_of_bounds); +} + +TEST(Buffer, buffer_copy_overloads) +{ + auto data = std::vector{1, 2, 3}; + auto expected = data; + auto orig_host_buffer = buffer(data.data(), data.size(), memory_type::host); + auto orig_dev_buffer = buffer(orig_host_buffer, memory_type::device); + auto copy_dev_buffer = buffer(data.size(), memory_type::device); -// // copying host to host -// auto data_out = std::vector(data.size()); -// auto copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); -// copy(copy_host_buffer, orig_host_buffer); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); - -// // copying host to host with stream -// data_out = std::vector(data.size()); -// copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); -// copy(copy_host_buffer, orig_host_buffer, execution_stream{}); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); - -// // copying host to host with offset -// data_out = std::vector(data.size() + 1); -// copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); -// 
copy(copy_host_buffer, orig_host_buffer, 2, 1, 1, execution_stream{}); -// expected = std::vector{0, 0, 2, 0}; -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); - -// #ifndef RAFT_DISABLE_CUDA -// // copy device to host -// data_out = std::vector(data.size()); -// copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); -// copy(copy_host_buffer, orig_dev_buffer); -// expected = data; -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); - -// // copy device to host with stream -// data_out = std::vector(data.size()); -// copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); -// copy(copy_host_buffer, orig_dev_buffer, execution_stream{}); -// expected = data; -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + // copying host to host + auto data_out = std::vector(data.size()); + auto copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); + copy(copy_host_buffer, orig_host_buffer); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + + // copying host to host with stream + data_out = std::vector(data.size()); + copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); + copy(copy_host_buffer, orig_host_buffer, execution_stream{}); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + + // copying host to host with offset + data_out = std::vector(data.size() + 1); + copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); + copy(copy_host_buffer, orig_host_buffer, 2, 1, 1, execution_stream{}); + expected = std::vector{0, 0, 2, 0}; + EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + +#ifndef RAFT_DISABLE_GPU + // copy device to host + data_out = std::vector(data.size()); + copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); + copy(copy_host_buffer, orig_dev_buffer); + expected = data; + EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + + // copy device 
to host with stream + data_out = std::vector(data.size()); + copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); + copy(copy_host_buffer, orig_dev_buffer, execution_stream{}); + expected = data; + EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); -// // copy device to host with offset -// data_out = std::vector(data.size() + 1); -// copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); -// copy(copy_host_buffer, orig_dev_buffer, 2, 1, 1, execution_stream{}); -// expected = std::vector{0, 0, 2, 0}; -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); -// #endif -// } + // copy device to host with offset + data_out = std::vector(data.size() + 1); + copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); + copy(copy_host_buffer, orig_dev_buffer, 2, 1, 1, execution_stream{}); + expected = std::vector{0, 0, 2, 0}; + EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); +#endif +} } \ No newline at end of file From 2f8b294459abfea7adbf096b14d4c305c41af77b Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 27 Apr 2023 13:01:57 -0700 Subject: [PATCH 14/75] Some updates after reviews --- cpp/include/raft/core/buffer.hpp | 16 ++++++---------- .../raft/core/detail/buffer_utils/copy_gpu.hpp | 17 ++++++++++------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/cpp/include/raft/core/buffer.hpp b/cpp/include/raft/core/buffer.hpp index df009637c3..84978fda93 100644 --- a/cpp/include/raft/core/buffer.hpp +++ b/cpp/include/raft/core/buffer.hpp @@ -37,12 +37,8 @@ namespace raft { * @brief A container which may or may not own its own data on host or device * */ -using index_type = std::size_t; template struct buffer { - using index_type = std::size_t; - using value_type = T; - using data_store = std::variant, detail::non_owning_buffer, detail::owning_buffer, @@ -51,7 +47,7 @@ struct buffer { buffer() : device_{}, data_{}, size_{}, memory_type_{memory_type::host} {} 
/** Construct non-initialized owning buffer */ - buffer(index_type size, + buffer(size_t size, memory_type mem_type = memory_type::host, int device = 0, execution_stream stream = 0) @@ -87,7 +83,7 @@ struct buffer { } /** Construct non-owning buffer */ - buffer(T* input_data, index_type size, memory_type mem_type = memory_type::host, int device = 0) + buffer(T* input_data, size_t size, memory_type mem_type = memory_type::host, int device = 0) : device_{[mem_type, &device]() { RAFT_LOG_INFO("Non owning constructor call started"); auto result = execution_device_id_variant{}; @@ -311,7 +307,7 @@ struct buffer { execution_device_id_variant device_; data_store data_; - index_type size_; + size_t size_; enum memory_type memory_type_; T* cached_ptr; }; @@ -319,9 +315,9 @@ struct buffer { template detail::const_agnostic_same_t copy(buffer& dst, buffer const& src, - typename buffer::index_type dst_offset, - typename buffer::index_type src_offset, - typename buffer::index_type size, + size_t dst_offset, + size_t src_offset, + size_t size, execution_stream stream) { if constexpr (bounds_check) { diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp index 0f5fbfc97f..a10e593c0d 100644 --- a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp @@ -42,13 +42,16 @@ std::enable_if_t< void> copy(T* dst, T const* src, uint32_t size, raft::execution_stream stream) { - - cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDefault, stream); - // auto it = std::iterator(std::remove_const(src)); - // auto dst_ptr = thrust::device_pointer_cast(dst); - // auto it = thrust::make_zip_iterator(thrust::make_tuple(src)); - // auto v = std::vector {1,2,3}; - // thrust::copy(rmm::exec_policy(stream), v.begin(), v.end(), dst); + if (src_type == device_type::cpu) { + raft::update_device(dst, src, size, stream); + } + else if (dst_type == device_type::cpu) { + 
raft::update_host(dst, src, size, stream); + cudaDeviceSynchronize(); + } + else { + raft::copy_async(dst, src, size, stream); + } } } // namespace detail From 6539ef479b607181398f7e4ab8b64b8af287110d Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Fri, 28 Apr 2023 11:50:51 -0700 Subject: [PATCH 15/75] Use raft::resources --- cpp/include/raft/core/buffer.hpp | 101 ++++++++------ .../core/detail/buffer_utils/buffer_copy.hpp | 57 +++----- .../core/detail/buffer_utils/copy_cpu.hpp | 10 +- .../core/detail/buffer_utils/copy_gpu.hpp | 11 +- .../buffer_utils/owning_buffer_base.hpp | 4 +- .../detail/buffer_utils/owning_buffer_gpu.hpp | 11 +- cpp/include/raft/core/execution_stream.hpp | 33 ----- cpp/test/core/buffer.cpp | 127 ++++++++++-------- cpp/test/core/buffer.cu | 10 +- 9 files changed, 174 insertions(+), 190 deletions(-) delete mode 100644 cpp/include/raft/core/execution_stream.hpp diff --git a/cpp/include/raft/core/buffer.hpp b/cpp/include/raft/core/buffer.hpp index 84978fda93..ecdae194e1 100644 --- a/cpp/include/raft/core/buffer.hpp +++ b/cpp/include/raft/core/buffer.hpp @@ -26,8 +26,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -47,10 +47,10 @@ struct buffer { buffer() : device_{}, data_{}, size_{}, memory_type_{memory_type::host} {} /** Construct non-initialized owning buffer */ - buffer(size_t size, + buffer(raft::resources const& handle, + size_t size, memory_type mem_type = memory_type::host, - int device = 0, - execution_stream stream = 0) + int device = 0) : device_{[mem_type, &device]() { auto result = execution_device_id_variant{}; if (is_device_accessible(mem_type)) { @@ -60,10 +60,10 @@ struct buffer { } return result; }()}, - data_{[this, mem_type, size, stream]() { + data_{[this, mem_type, size, handle]() { auto result = data_store{}; if (is_device_accessible(mem_type)) { - result = detail::owning_buffer{std::get<1>(device_), size, stream}; + result = detail::owning_buffer{handle, std::get<1>(device_), 
size}; } else { result = detail::owning_buffer{size}; } @@ -83,7 +83,7 @@ struct buffer { } /** Construct non-owning buffer */ - buffer(T* input_data, size_t size, memory_type mem_type = memory_type::host, int device = 0) + buffer(raft::resources const& handle, T* input_data, size_t size, memory_type mem_type = memory_type::host, int device = 0) : device_{[mem_type, &device]() { RAFT_LOG_INFO("Non owning constructor call started"); auto result = execution_device_id_variant{}; @@ -124,10 +124,10 @@ struct buffer { * A buffer constructed in this way is owning and will copy the data from * the original location */ - buffer(buffer const& other, + buffer(raft::resources const& handle, + buffer const& other, memory_type mem_type, - int device = 0, - execution_stream stream = execution_stream{}) + int device = 0) : device_{[mem_type, &device]() { auto result = execution_device_id_variant{}; if (is_device_accessible(mem_type)) { @@ -137,22 +137,22 @@ struct buffer { } return result; }()}, - data_{[this, &other, mem_type, device, stream]() { + data_{[this, &other, mem_type, device, handle]() { auto result = data_store{}; auto result_data = static_cast(nullptr); if (is_device_accessible(mem_type)) { auto buf = - detail::owning_buffer(std::get<1>(device_), other.size(), stream); + detail::owning_buffer(handle, std::get<1>(device_), other.size()); result_data = buf.get(); result = std::move(buf); RAFT_LOG_INFO("gpu copy called"); - detail::buffer_copy(result_data, other.data(), other.size(), device_type::gpu, other.dev_type(), stream); + detail::buffer_copy(handle, result_data, other.data(), other.size(), device_type::gpu, other.dev_type()); } else { auto buf = detail::owning_buffer(other.size()); result_data = buf.get(); result = std::move(buf); RAFT_LOG_INFO("cpu copy called"); - detail::buffer_copy(result_data, other.data(), other.size(), device_type::cpu, other.dev_type(), stream); + detail::buffer_copy(handle, result_data, other.data(), other.size(), device_type::cpu, 
other.dev_type()); } return result; }()}, @@ -189,7 +189,7 @@ struct buffer { * @brief Create owning copy of existing buffer with given stream * The device type of this new buffer will be the same as the original */ - buffer(buffer const& other, execution_stream stream=execution_stream{}) : buffer(other, other.mem_type(), other.device_index(), stream) + buffer(raft::resources const& handle, buffer const& other) : buffer(handle, other, other.mem_type(), other.device_index()) { } @@ -197,7 +197,7 @@ struct buffer { * @brief Move from existing buffer unless a copy is necessary based on * memory location */ - buffer(buffer&& other, memory_type mem_type, int device, execution_stream stream) + buffer(raft::resources const& handle, buffer&& other, memory_type mem_type, int device) : device_{[mem_type, &device]() { auto result = execution_device_id_variant{}; if (is_device_accessible(mem_type)) { @@ -207,22 +207,22 @@ struct buffer { } return result; }()}, - data_{[&other, mem_type, device, stream]() { + data_{[&other, mem_type, device, handle]() { auto result = data_store{}; if (mem_type == other.mem_type() && device == other.device_index()) { result = std::move(other.data_); } else { auto* result_data = static_cast(nullptr); if (is_device_accessible(mem_type)) { - auto buf = detail::owning_buffer{device, other.size(), stream}; + auto buf = detail::owning_buffer{handle, device, other.size()}; result_data = buf.get(); result = std::move(buf); - detail::buffer_copy(result_data, other.data(), other.size(), device_type::gpu, other.dev_type(), stream); + detail::buffer_copy(handle, result_data, other.data(), other.size(), device_type::gpu, other.dev_type()); } else { auto buf = detail::owning_buffer{other.size()}; result_data = buf.get(); result = std::move(buf); - detail::buffer_copy(result_data, other.data(), other.size(), device_type::cpu, other.dev_type(), stream); + detail::buffer_copy(handle, result_data, other.data(), other.size(), device_type::cpu, other.dev_type()); } 
} return result; @@ -232,6 +232,8 @@ struct buffer { cached_ptr{[this]() { auto result = static_cast(nullptr); switch (data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; case 2: result = std::get<2>(data_).get(); break; case 3: result = std::get<3>(data_).get(); break; } @@ -240,8 +242,8 @@ struct buffer { { RAFT_LOG_INFO("original move called"); } - buffer(buffer&& other, device_type mem_type, int device=0) - : buffer{std::move(other), mem_type, device, execution_stream{}} + buffer(raft::resources const& handle, buffer&& other, device_type mem_type, int device=0) + : buffer{handle, std::move(other), mem_type, device} { RAFT_LOG_INFO("move constructor without stream called"); } @@ -252,7 +254,35 @@ struct buffer { // } buffer(buffer&& other) noexcept - : buffer{std::move(other), other.mem_type(), other.device_index(), execution_stream{}} {} + : device_{[&other]() { + auto result = execution_device_id_variant{}; + if (is_device_accessible(other.mem_type())) { + result = execution_device_id{other.device_index()}; + } else { + result = execution_device_id{other.device_index()}; + } + return result; + }()}, + data_{[&other]() { + auto result = data_store{}; + result = std::move(other.data_); + return result; + }()}, + size_{other.size()}, + memory_type_{other.mem_type()}, + cached_ptr{[this]() { + auto result = static_cast(nullptr); + switch (data_.index()) { + case 0: result = std::get<0>(data_).get(); break; + case 1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; + } + return result; + }()} + { + RAFT_LOG_INFO("trivial move called"); + } buffer& operator=(buffer&& other) noexcept { data_ = std::move(other.data_); device_ = std::move(other.device_); @@ -313,12 +343,12 @@ struct buffer { }; template -detail::const_agnostic_same_t copy(buffer& dst, +detail::const_agnostic_same_t copy(raft::resources 
const& handle, + buffer& dst, buffer const& src, size_t dst_offset, size_t src_offset, - size_t size, - execution_stream stream) + size_t size) { if constexpr (bounds_check) { if (src.size() - src_offset < size || dst.size() - dst_offset < size) { @@ -327,24 +357,19 @@ detail::const_agnostic_same_t copy(buffer& dst, } auto src_device_type = is_device_accessible(src.mem_type()) ? device_type::gpu : device_type::cpu; auto dst_device_type = is_device_accessible(dst.mem_type()) ? device_type::gpu : device_type::cpu; - detail::buffer_copy(dst.data() + dst_offset, + detail::buffer_copy(handle, + dst.data() + dst_offset, src.data() + src_offset, size, dst_device_type, - src_device_type, - stream); + src_device_type); } template -detail::const_agnostic_same_t copy(buffer& dst, - buffer const& src, - execution_stream stream) -{ - copy(dst, src, 0, 0, src.size(), stream); -} -template -detail::const_agnostic_same_t copy(buffer& dst, buffer const& src) +detail::const_agnostic_same_t copy(raft::resources const& handle, + buffer& dst, + buffer const& src) { - copy(dst, src, 0, 0, src.size(), execution_stream{}); + copy(handle, dst, src, 0, 0, src.size()); } } // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp b/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp index 06c374b542..3ec58d65a5 100644 --- a/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp @@ -14,88 +14,69 @@ * limitations under the License. 
*/ #pragma once -#include "raft/util/cudart_utils.hpp" +#include #include #include -#include #ifndef RAFT_DISABLE_GPU #include #endif #include - +#include namespace raft { namespace detail { template -void buffer_copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset) -{ - copy(dst + dst_offset, src + src_offset, size, execution_stream{}); -} - -template -void buffer_copy(T* dst, +void buffer_copy(raft::resources const& handle, + T* dst, T const* src, uint32_t size, uint32_t dst_offset, - uint32_t src_offset, - execution_stream stream) -{ - copy(dst + dst_offset, src + src_offset, size, stream); -} - -template -void buffer_copy(T* dst, T const* src, uint32_t size) + uint32_t src_offset) { - copy(dst, src, size, execution_stream{}); + copy(handle, dst + dst_offset, src + src_offset, size); } template -void buffer_copy(T* dst, T const* src, uint32_t size, execution_stream stream) +void buffer_copy(raft::resources const& handle, T* dst, T const* src, uint32_t size) { - copy(dst, src, size, stream); + copy(handle, dst, src, size); } template -void buffer_copy(T* dst, +void buffer_copy(raft::resources const& handle, + T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type, uint32_t dst_offset, - uint32_t src_offset, - execution_stream stream) + uint32_t src_offset) { if (dst_type == device_type::gpu && src_type == device_type::gpu) { copy( - dst + dst_offset, src + src_offset, size, stream); + handle, dst + dst_offset, src + src_offset, size); } else if (dst_type == device_type::cpu && src_type == device_type::cpu) { copy( - dst + dst_offset, src + src_offset, size, stream); + handle, dst + dst_offset, src + src_offset, size); } else if (dst_type == device_type::gpu && src_type == device_type::cpu) { raft::print_device_vector("dst_1", dst + dst_offset, size, std::cout); copy( - dst + dst_offset, src + src_offset, size, stream); + handle, dst + dst_offset, src + src_offset, size); raft::print_device_vector("dst_2", dst 
+ dst_offset, size, std::cout); } else if (dst_type == device_type::cpu && src_type == device_type::gpu) { copy( - dst + dst_offset, src + src_offset, size, stream); + handle, dst + dst_offset, src + src_offset, size); } } template -void buffer_copy(T* dst, T const* src, uint32_t size, device_type dst_type, device_type src_type) -{ - buffer_copy(dst, src, size, dst_type, src_type, 0, 0, execution_stream{}); -} - -template -void buffer_copy(T* dst, +void buffer_copy(raft::resources const& handle, + T* dst, T const* src, uint32_t size, device_type dst_type, - device_type src_type, - execution_stream stream) + device_type src_type) { - buffer_copy(dst, src, size, dst_type, src_type, 0, 0, stream); + buffer_copy(handle, dst, src, size, dst_type, src_type, 0, 0); } } // namespace detail } // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp index 2555c251b3..5f879710fb 100644 --- a/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp @@ -18,7 +18,7 @@ #include #include #include -#include +#include namespace raft { namespace detail { @@ -27,18 +27,18 @@ template std::enable_if_t, std::bool_constant>, void> -copy(T* dst, T const* src, uint32_t size, execution_stream stream) +copy(raft::resources const& handle, T* dst, T const* src, uint32_t size) { std::copy(src, src + size, dst); } template std::enable_if_t< - std::conjunction_v, - std::bool_constant>, + std::conjunction_v, + std::bool_constant>, std::bool_constant>, void> -copy(T* dst, T const* src, uint32_t size, execution_stream stream) +copy(raft::resources const& handle, T* dst, T const* src, uint32_t size) { throw raft::cuda_unsupported("Copying from or to device in non-GPU build"); } diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp index a10e593c0d..f1f4d8b102 
100644 --- a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp @@ -14,6 +14,7 @@ * limitations under the License. */ #pragma once +#include "raft/core/resource/cuda_stream.hpp" #include "thrust/detail/raw_pointer_cast.h" #include "thrust/detail/tuple.inl" #include "thrust/iterator/zip_iterator.h" @@ -23,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -40,17 +41,17 @@ std::enable_if_t< std::bool_constant>, std::bool_constant>, void> -copy(T* dst, T const* src, uint32_t size, raft::execution_stream stream) +copy(raft::resources const& handle, T* dst, T const* src, uint32_t size) { if (src_type == device_type::cpu) { - raft::update_device(dst, src, size, stream); + raft::update_device(dst, src, size, raft::resource::get_cuda_stream(handle)); } else if (dst_type == device_type::cpu) { - raft::update_host(dst, src, size, stream); + raft::update_host(dst, src, size, raft::resource::get_cuda_stream(handle)); cudaDeviceSynchronize(); } else { - raft::copy_async(dst, src, size, stream); + raft::copy_async(dst, src, size, raft::resource::get_cuda_stream(handle)); } } diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp index c112844a3a..61cee5aa6e 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp @@ -16,7 +16,7 @@ #pragma once #include #include -#include +#include #include namespace raft { @@ -25,7 +25,7 @@ namespace detail { template struct owning_buffer { owning_buffer() {} - owning_buffer(execution_device_id device_id, std::size_t size, execution_stream stream) {} + owning_buffer(raft::resources const& handle, execution_device_id device_id, std::size_t size) {} auto* get() const { return static_cast(nullptr); } }; diff --git 
a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp index 662d4caeae..81a0f611bf 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include namespace raft { @@ -28,12 +29,12 @@ struct owning_buffer { using value_type = std::remove_const_t; owning_buffer() : data_{} {} - owning_buffer(execution_device_id execution_device_id, - std::size_t size, - cudaStream_t stream) noexcept(false) - : data_{[&execution_device_id, &size, &stream]() { + owning_buffer(raft::resources const& handle, + execution_device_id execution_device_id, + std::size_t size) noexcept(false) + : data_{[&execution_device_id, &size, handle]() { auto device_context = device_setter{execution_device_id}; - return rmm::device_buffer{size * sizeof(value_type), rmm::cuda_stream_view{stream}}; + return rmm::device_buffer{size * sizeof(value_type), raft::resource::get_cuda_stream(handle)}; }()} { } diff --git a/cpp/include/raft/core/execution_stream.hpp b/cpp/include/raft/core/execution_stream.hpp deleted file mode 100644 index cb1e069f4a..0000000000 --- a/cpp/include/raft/core/execution_stream.hpp +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once -#ifndef RAFT_DISABLE_GPU -#include -#endif - -namespace raft { -#ifndef RAFT_DISABLE_GPU -using execution_stream = cudaStream_t; -#else -using execution_stream = int; -#endif -inline void synchronize(execution_stream stream) -{ -#ifndef RAFT_DISABLE_GPU - cudaStreamSynchronize(stream); -#endif -} -} // namespace raft \ No newline at end of file diff --git a/cpp/test/core/buffer.cpp b/cpp/test/core/buffer.cpp index a6192ebc36..a277a9b3ea 100644 --- a/cpp/test/core/buffer.cpp +++ b/cpp/test/core/buffer.cpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include @@ -33,11 +33,12 @@ TEST(Buffer, default_buffer) TEST(Buffer, device_buffer) { + raft::resources handle; auto data = std::vector{1, 2, 3}; auto test_buffers = std::vector>{}; - test_buffers.emplace_back(data.size(), memory_type::device, 0, execution_stream{}); - test_buffers.emplace_back(data.size(), memory_type::device, 0); - test_buffers.emplace_back(data.size(), memory_type::device); + test_buffers.emplace_back(handle, data.size(), memory_type::device, 0); + test_buffers.emplace_back(handle, data.size(), memory_type::device, 0); + test_buffers.emplace_back(handle, data.size(), memory_type::device); for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::device); @@ -61,6 +62,7 @@ TEST(Buffer, device_buffer) TEST(Buffer, non_owning_device_buffer) { + raft::resources handle; auto data = std::vector{1, 2, 3}; auto* ptr_d = static_cast(nullptr); #ifndef RAFT_DISABLE_GPU @@ -71,8 +73,8 @@ TEST(Buffer, non_owning_device_buffer) cudaMemcpyHostToDevice); #endif auto test_buffers = std::vector>{}; - test_buffers.emplace_back(ptr_d, data.size(), memory_type::device, 0); - test_buffers.emplace_back(ptr_d, data.size(), memory_type::device); + test_buffers.emplace_back(handle, ptr_d, data.size(), memory_type::device, 0); + test_buffers.emplace_back(handle, ptr_d, data.size(), memory_type::device); #ifndef RAFT_DISABLE_GPU for (auto& buf : test_buffers) { @@ 
-92,13 +94,14 @@ TEST(Buffer, non_owning_device_buffer) } TEST(Buffer, host_buffer) -{ +{ + raft::resources handle; auto data = std::vector{1, 2, 3}; auto test_buffers = std::vector>{}; - test_buffers.emplace_back(data.size(), memory_type::host, 0, execution_stream{}); - test_buffers.emplace_back(data.size(), memory_type::host, 0); - test_buffers.emplace_back(data.size(), memory_type::host); - test_buffers.emplace_back(data.size()); + test_buffers.emplace_back(handle, data.size(), memory_type::host, 0); + test_buffers.emplace_back(handle, data.size(), memory_type::host, 0); + test_buffers.emplace_back(handle, data.size(), memory_type::host); + test_buffers.emplace_back(handle, data.size()); for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::host); @@ -115,11 +118,12 @@ TEST(Buffer, host_buffer) TEST(Buffer, non_owning_host_buffer) { + raft::resources handle; auto data = std::vector{1, 2, 3}; std::vector> test_buffers; - test_buffers.emplace_back(data.data(), data.size(), memory_type::host, 0); - test_buffers.emplace_back(data.data(), data.size(), memory_type::host); - test_buffers.emplace_back(data.data(), data.size()); + test_buffers.emplace_back(handle, data.data(), data.size(), memory_type::host, 0); + test_buffers.emplace_back(handle, data.data(), data.size(), memory_type::host); + test_buffers.emplace_back(handle, data.data(), data.size()); for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::host); @@ -133,15 +137,16 @@ TEST(Buffer, non_owning_host_buffer) TEST(Buffer, copy_constructor) { + raft::resources handle; auto data = std::vector{1, 2, 3}; - buffer const orig_buffer = buffer(data.data(), data.size(), memory_type::host); + buffer const orig_buffer = buffer(handle, data.data(), data.size(), memory_type::host); // host to host copy operations auto test_buffers = std::vector>{}; - test_buffers.emplace_back(orig_buffer); - test_buffers.emplace_back(orig_buffer, memory_type::host); - 
test_buffers.emplace_back(orig_buffer, memory_type::host, 0); - test_buffers.emplace_back(orig_buffer, memory_type::host, 0, execution_stream{}); + test_buffers.emplace_back(handle, orig_buffer); + test_buffers.emplace_back(handle, orig_buffer, memory_type::host); + test_buffers.emplace_back(handle, orig_buffer, memory_type::host, 0); + test_buffers.emplace_back(handle, orig_buffer, memory_type::host, 0); for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::host); @@ -154,9 +159,9 @@ TEST(Buffer, copy_constructor) #ifndef RAFT_DISABLE_GPU // host to device copy operations auto test_dev_buffers = std::vector>{}; - test_dev_buffers.emplace_back(orig_buffer, memory_type::device); - test_dev_buffers.emplace_back(orig_buffer, memory_type::device, 0); - test_dev_buffers.emplace_back(orig_buffer, memory_type::device, 0, execution_stream{}); + test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); + test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device, 0); + test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device, 0); for (auto& dev_buf : test_dev_buffers) { data_out = std::vector(data.size()); RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(dev_buf.data()), dev_buf.size() * sizeof(int), cudaMemcpyDefault)); @@ -164,9 +169,9 @@ TEST(Buffer, copy_constructor) // device to device copy operations auto test_dev_copies = std::vector>{}; - test_dev_copies.emplace_back(dev_buf, memory_type::device); - test_dev_copies.emplace_back(dev_buf, memory_type::device, 0); - test_dev_copies.emplace_back(dev_buf, memory_type::device, 0, execution_stream{}); + test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); + test_dev_copies.emplace_back(handle, dev_buf, memory_type::device, 0); + test_dev_copies.emplace_back(handle, dev_buf, memory_type::device, 0); for (auto& copy_buf : test_dev_copies) { data_out = std::vector(data.size()); RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), 
static_cast(copy_buf.data()), copy_buf.size() * sizeof(int), cudaMemcpyDefault)); @@ -175,9 +180,9 @@ TEST(Buffer, copy_constructor) // device to host copy operations auto test_host_buffers = std::vector>{}; - test_host_buffers.emplace_back(dev_buf, memory_type::host); - test_host_buffers.emplace_back(dev_buf, memory_type::host, 0); - test_host_buffers.emplace_back(dev_buf, memory_type::host, 0, execution_stream{}); + test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); + test_host_buffers.emplace_back(handle, dev_buf, memory_type::host, 0); + test_host_buffers.emplace_back(handle, dev_buf, memory_type::host, 0); for (auto& host_buf : test_host_buffers) { data_out = std::vector(host_buf.data(), host_buf.data() + host_buf.size()); EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); @@ -189,12 +194,13 @@ TEST(Buffer, copy_constructor) TEST(Buffer, move_buffer) { + raft::resources handle; auto data = std::vector{1, 2, 3}; auto test_buffers = std::vector>{}; - test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host)); - test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::host); - test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::host, 0); - test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::host, 0, execution_stream{}); + test_buffers.emplace_back(buffer(handle, data.data(), data.size(), memory_type::host)); + test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host); + test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host, 0); + test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host, 0); for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::host); @@ -206,9 +212,9 @@ TEST(Buffer, move_buffer) } #ifndef RAFT_DISABLE_GPU 
test_buffers = std::vector>{}; - test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::device); - test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::device, 0); - test_buffers.emplace_back(buffer(data.data(), data.size(), memory_type::host), memory_type::device, 0, execution_stream{}); + test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device); + test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device, 0); + test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device, 0); for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::device); ASSERT_EQ(buf.size(), data.size()); @@ -223,14 +229,15 @@ TEST(Buffer, move_buffer) TEST(Buffer, move_assignment_buffer) { + raft::resources handle; auto data = std::vector{1, 2, 3}; #ifndef RAFT_DISABLE_GPU - auto buf = buffer{data.data(), data.size() - 1, memory_type::device}; + auto buf = buffer{handle, data.data(), data.size() - 1, memory_type::device}; #else - auto buf = buffer{data.data(), data.size() - 1, memory_type::host}; + auto buf = buffer{handle, data.data(), data.size() - 1, memory_type::host}; #endif - buf = buffer{data.size(), memory_type::host}; + buf = buffer{handle, data.size(), memory_type::host}; ASSERT_EQ(buf.mem_type(), memory_type::host); ASSERT_EQ(buf.size(), data.size()); @@ -238,66 +245,68 @@ TEST(Buffer, move_assignment_buffer) TEST(Buffer, partial_buffer_copy) { + raft::resources handle; auto data1 = std::vector{1, 2, 3, 4, 5}; auto data2 = std::vector{0, 0, 0, 0, 0}; auto expected = std::vector{0, 3, 4, 5, 0}; #ifndef RAFT_DISABLE_GPU - auto buf1 = buffer{buffer{data1.data(), data1.size(), memory_type::host}, memory_type::device}; + auto buf1 = buffer{handle, buffer{handle, data1.data(), data1.size(), memory_type::host}, 
memory_type::device}; #else - auto buf1 = buffer{data1.data(), data1.size(), memory_type::host}; + auto buf1 = buffer{handle, data1.data(), data1.size(), memory_type::host}; #endif - auto buf2 = buffer{data2.data(), data2.size(), memory_type::host}; - copy(buf2, buf1, 1, 2, 3, execution_stream{}); - copy(buf2, buf1, 1, 2, 3, execution_stream{}); - EXPECT_THROW(copy(buf2, buf1, 1, 2, 4, execution_stream{}), out_of_bounds); + auto buf2 = buffer{handle, data2.data(), data2.size(), memory_type::host}; + copy(handle, buf2, buf1, 1, 2, 3); + copy(handle, buf2, buf1, 1, 2, 3); + EXPECT_THROW(copy(handle, buf2, buf1, 1, 2, 4), out_of_bounds); } TEST(Buffer, buffer_copy_overloads) { + raft::resources handle; auto data = std::vector{1, 2, 3}; auto expected = data; - auto orig_host_buffer = buffer(data.data(), data.size(), memory_type::host); - auto orig_dev_buffer = buffer(orig_host_buffer, memory_type::device); - auto copy_dev_buffer = buffer(data.size(), memory_type::device); + auto orig_host_buffer = buffer(handle, data.data(), data.size(), memory_type::host); + auto orig_dev_buffer = buffer(handle, orig_host_buffer, memory_type::device); + auto copy_dev_buffer = buffer(handle, data.size(), memory_type::device); // copying host to host auto data_out = std::vector(data.size()); - auto copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); - copy(copy_host_buffer, orig_host_buffer); + auto copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); + copy(handle, copy_host_buffer, orig_host_buffer); EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); // copying host to host with stream data_out = std::vector(data.size()); - copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); - copy(copy_host_buffer, orig_host_buffer, execution_stream{}); + copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); + copy(handle, copy_host_buffer, orig_host_buffer); EXPECT_THAT(data_out, 
::testing::ElementsAreArray(expected)); // copying host to host with offset data_out = std::vector(data.size() + 1); - copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); - copy(copy_host_buffer, orig_host_buffer, 2, 1, 1, execution_stream{}); + copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); + copy(handle, copy_host_buffer, orig_host_buffer, 2, 1, 1); expected = std::vector{0, 0, 2, 0}; EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); #ifndef RAFT_DISABLE_GPU // copy device to host data_out = std::vector(data.size()); - copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); - copy(copy_host_buffer, orig_dev_buffer); + copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); + copy(handle, copy_host_buffer, orig_dev_buffer); expected = data; EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); // copy device to host with stream data_out = std::vector(data.size()); - copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); - copy(copy_host_buffer, orig_dev_buffer, execution_stream{}); + copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); + copy(handle, copy_host_buffer, orig_dev_buffer); expected = data; EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); // copy device to host with offset data_out = std::vector(data.size() + 1); - copy_host_buffer = buffer(data_out.data(), data.size(), memory_type::host); - copy(copy_host_buffer, orig_dev_buffer, 2, 1, 1, execution_stream{}); + copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); + copy(handle, copy_host_buffer, orig_dev_buffer, 2, 1, 1); expected = std::vector{0, 0, 2, 0}; EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); #endif diff --git a/cpp/test/core/buffer.cu b/cpp/test/core/buffer.cu index 5881dfffc8..42b1f1c224 100644 --- a/cpp/test/core/buffer.cu +++ b/cpp/test/core/buffer.cu 
@@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include @@ -41,12 +41,12 @@ TEST(Buffer, device_buffer_access) { auto data = std::vector{1, 2, 3}; auto expected = std::vector{4, 5, 6}; + raft::resources handle; auto buf = buffer( - buffer(data.data(), data.size(), memory_type::host), + handle, + buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device, - 0, - execution_stream{} - ); + 0); // check_buffer_access<<<1,1>>>(buf.data()); // auto data_out = std::vector(expected.size()); // auto host_buf = buffer(data_out.data(), data_out.size(), memory_type::host); From 008bb5b2a49e707856c5012ce925537576a7b169 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Fri, 28 Apr 2023 14:58:34 -0700 Subject: [PATCH 16/75] move exception --- cpp/include/raft/core/buffer.hpp | 14 +++--- cpp/include/raft/core/error.hpp | 24 ++++++++++ cpp/include/raft/core/exceptions.hpp | 67 ---------------------------- cpp/test/core/buffer.cpp | 2 +- 4 files changed, 30 insertions(+), 77 deletions(-) delete mode 100644 cpp/include/raft/core/exceptions.hpp diff --git a/cpp/include/raft/core/buffer.hpp b/cpp/include/raft/core/buffer.hpp index ecdae194e1..f159651e1a 100644 --- a/cpp/include/raft/core/buffer.hpp +++ b/cpp/include/raft/core/buffer.hpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -197,7 +197,7 @@ struct buffer { * @brief Move from existing buffer unless a copy is necessary based on * memory location */ - buffer(raft::resources const& handle, buffer&& other, memory_type mem_type, int device) + buffer(raft::resources const& handle, buffer&& other, memory_type mem_type, int device = 0) : device_{[mem_type, &device]() { auto result = execution_device_id_variant{}; if (is_device_accessible(mem_type)) { @@ -240,12 +240,7 @@ struct buffer { return result; }()} { - RAFT_LOG_INFO("original move called"); - } - buffer(raft::resources const& handle, buffer&& other, device_type mem_type, int 
device=0) - : buffer{handle, std::move(other), mem_type, device} - { - RAFT_LOG_INFO("move constructor without stream called"); + RAFT_LOG_INFO("main move called"); } // buffer(buffer&& other, device_type mem_type) // : buffer{std::move(other), mem_type, 0, execution_stream{}} @@ -284,6 +279,7 @@ struct buffer { RAFT_LOG_INFO("trivial move called"); } buffer& operator=(buffer&& other) noexcept { + RAFT_LOG_INFO("operator= move called"); data_ = std::move(other.data_); device_ = std::move(other.device_); size_ = std::move(other.size_); @@ -301,7 +297,7 @@ struct buffer { case 2: result = std::get<2>(data_).get(); break; case 3: result = std::get<3>(data_).get(); break; } - RAFT_LOG_INFO("data %p; cached_ptr %p\n", result, cached_ptr); + RAFT_LOG_INFO("data() called: data %p; cached_ptr %p\n", result, cached_ptr); return result;} auto device() const noexcept { return device_; } diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp index 84b244f4dc..797464672e 100644 --- a/cpp/include/raft/core/error.hpp +++ b/cpp/include/raft/core/error.hpp @@ -102,6 +102,30 @@ struct logic_error : public raft::exception { * @} */ +struct bad_cuda_call : logic_error { + bad_cuda_call() : bad_cuda_call("CUDA API call failed") {} + explicit bad_cuda_call(char const* msg) : logic_error(msg) {} +}; + +struct out_of_bounds : logic_error { + out_of_bounds() : out_of_bounds("Attempted out-of-bounds memory access") {} + explicit out_of_bounds(char const* msg) : logic_error(msg) {} +}; + +struct wrong_device_type : logic_error { + wrong_device_type() : wrong_device_type("Attempted to use host data on GPU or device data on CPU") {} + explicit wrong_device_type(char const* msg) : logic_error(msg) {} +}; + +struct mem_type_mismatch : logic_error { + mem_type_mismatch() : mem_type_mismatch("Memory type does not match expected type") {} + explicit mem_type_mismatch(char const* msg) : logic_error(msg) {} +}; + +struct wrong_device : logic_error { + wrong_device() : 
wrong_device("Attempted to use incorrect device") {} + explicit wrong_device(char const* msg) : logic_error(msg) {} +}; } // namespace raft // FIXME: Need to be replaced with RAFT_FAIL diff --git a/cpp/include/raft/core/exceptions.hpp b/cpp/include/raft/core/exceptions.hpp deleted file mode 100644 index bdd5e03856..0000000000 --- a/cpp/include/raft/core/exceptions.hpp +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once -#include - -namespace raft { -struct bad_cuda_call : raft::exception { - bad_cuda_call() : bad_cuda_call("CUDA API call failed") {} - bad_cuda_call(char const* msg) : msg_{msg} {} - virtual char const* what() const noexcept { return msg_; } - - private: - char const* msg_; -}; - -struct out_of_bounds : raft::exception { - out_of_bounds() : out_of_bounds("Attempted out-of-bounds memory access") {} - out_of_bounds(char const* msg) : msg_{msg} {} - virtual char const* what() const noexcept { return msg_; } - - private: - char const* msg_; -}; - -struct wrong_device_type : raft::exception { - wrong_device_type() : wrong_device_type("Attempted to use host data on GPU or device data on CPU") - { - } - wrong_device_type(char const* msg) : msg_{msg} {} - virtual char const* what() const noexcept { return msg_; } - - private: - char const* msg_; -}; - -struct mem_type_mismatch : raft::exception { - mem_type_mismatch() : mem_type_mismatch("Memory type does not match expected type") {} - mem_type_mismatch(char const* msg) : msg_{msg} {} - virtual char const* what() const noexcept { return msg_; } - - private: - char const* msg_; -}; - -struct wrong_device : raft::exception { - wrong_device() : wrong_device("Attempted to use incorrect device") {} - wrong_device(char const* msg) : msg_{msg} {} - virtual char const* what() const noexcept { return msg_; } - - private: - char const* msg_; -}; - -} // namespace raft \ No newline at end of file diff --git a/cpp/test/core/buffer.cpp b/cpp/test/core/buffer.cpp index a277a9b3ea..5795b12115 100644 --- a/cpp/test/core/buffer.cpp +++ b/cpp/test/core/buffer.cpp @@ -19,7 +19,7 @@ #include #include #include -#include +#include namespace raft { From 5b97273910f590bec11261129f0f9baef273c896 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 2 May 2023 16:58:34 -0700 Subject: [PATCH 17/75] Updates after PR Reviews --- cpp/include/raft/core/buffer.hpp | 123 ++++++------------ .../buffer_utils/owning_buffer_base.hpp | 3 +- 
.../detail/buffer_utils/owning_buffer_cpu.hpp | 1 - .../detail/buffer_utils/owning_buffer_gpu.hpp | 6 +- .../raft/core/detail/device_setter_base.hpp | 30 ----- .../raft/core/detail/device_setter_gpu.hpp | 47 ------- .../core/detail/execution_device_id_base.hpp | 29 ----- .../core/detail/execution_device_id_cpu.hpp | 34 ----- .../core/detail/execution_device_id_gpu.hpp | 50 ------- cpp/include/raft/core/device_setter.hpp | 27 ---- cpp/include/raft/core/device_support.hpp | 6 - cpp/include/raft/core/error.hpp | 5 + cpp/include/raft/core/execution_device_id.hpp | 32 ----- cpp/test/core/buffer.cpp | 75 ++++++----- cpp/test/core/buffer.cu | 3 +- 15 files changed, 82 insertions(+), 389 deletions(-) delete mode 100644 cpp/include/raft/core/detail/device_setter_base.hpp delete mode 100644 cpp/include/raft/core/detail/device_setter_gpu.hpp delete mode 100644 cpp/include/raft/core/detail/execution_device_id_base.hpp delete mode 100644 cpp/include/raft/core/detail/execution_device_id_cpu.hpp delete mode 100644 cpp/include/raft/core/detail/execution_device_id_gpu.hpp delete mode 100644 cpp/include/raft/core/device_setter.hpp delete mode 100644 cpp/include/raft/core/execution_device_id.hpp diff --git a/cpp/include/raft/core/buffer.hpp b/cpp/include/raft/core/buffer.hpp index f159651e1a..0d49237d2d 100644 --- a/cpp/include/raft/core/buffer.hpp +++ b/cpp/include/raft/core/buffer.hpp @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -44,26 +43,19 @@ struct buffer { detail::owning_buffer, detail::owning_buffer>; - buffer() : device_{}, data_{}, size_{}, memory_type_{memory_type::host} {} + buffer() : device_type_{}, data_{}, size_{}, memory_type_{memory_type::host} {} /** Construct non-initialized owning buffer */ buffer(raft::resources const& handle, size_t size, - memory_type mem_type = memory_type::host, - int device = 0) - : device_{[mem_type, &device]() { - auto result = execution_device_id_variant{}; - if (is_device_accessible(mem_type)) { - 
result = execution_device_id{device}; - } else { - result = execution_device_id{device}; - } - return result; + memory_type mem_type = memory_type::host) + : device_type_{[mem_type]() { + return is_device_accessible(mem_type) ? device_type::gpu : device_type::cpu; }()}, data_{[this, mem_type, size, handle]() { auto result = data_store{}; if (is_device_accessible(mem_type)) { - result = detail::owning_buffer{handle, std::get<1>(device_), size}; + result = detail::owning_buffer{handle, size}; } else { result = detail::owning_buffer{size}; } @@ -83,16 +75,10 @@ struct buffer { } /** Construct non-owning buffer */ - buffer(raft::resources const& handle, T* input_data, size_t size, memory_type mem_type = memory_type::host, int device = 0) - : device_{[mem_type, &device]() { + buffer(raft::resources const& handle, T* input_data, size_t size, memory_type mem_type = memory_type::host) + : device_type_{[mem_type]() { RAFT_LOG_INFO("Non owning constructor call started"); - auto result = execution_device_id_variant{}; - if (is_device_accessible(mem_type)) { - result = execution_device_id{device}; - } else { - result = execution_device_id{device}; - } - return result; + return is_device_accessible(mem_type) ? device_type::gpu : device_type::cpu; }()}, data_{[this, input_data, mem_type]() { auto result = data_store{}; @@ -126,33 +112,26 @@ struct buffer { */ buffer(raft::resources const& handle, buffer const& other, - memory_type mem_type, - int device = 0) - : device_{[mem_type, &device]() { - auto result = execution_device_id_variant{}; - if (is_device_accessible(mem_type)) { - result = execution_device_id{device}; - } else { - result = execution_device_id{device}; - } - return result; + memory_type mem_type) + : device_type_{[mem_type]() { + return is_device_accessible(mem_type) ? 
device_type::gpu : device_type::cpu; }()}, - data_{[this, &other, mem_type, device, handle]() { + data_{[this, &other, mem_type, handle]() { auto result = data_store{}; auto result_data = static_cast(nullptr); if (is_device_accessible(mem_type)) { auto buf = - detail::owning_buffer(handle, std::get<1>(device_), other.size()); + detail::owning_buffer(handle, other.size()); result_data = buf.get(); result = std::move(buf); RAFT_LOG_INFO("gpu copy called"); - detail::buffer_copy(handle, result_data, other.data(), other.size(), device_type::gpu, other.dev_type()); + detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::gpu, other.dev_type()); } else { auto buf = detail::owning_buffer(other.size()); result_data = buf.get(); result = std::move(buf); RAFT_LOG_INFO("cpu copy called"); - detail::buffer_copy(handle, result_data, other.data(), other.size(), device_type::cpu, other.dev_type()); + detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::cpu, other.dev_type()); } return result; }()}, @@ -167,13 +146,13 @@ struct buffer { return result; }()} { - RAFT_LOG_INFO("Pointer to other's data %p\n", other.data()); + RAFT_LOG_INFO("Pointer to other's data %p\n", other.data_handle()); } friend void swap(buffer& first, buffer& second) { using std::swap; - swap(first.device_, second.device_); + swap(first.device_type_, second.device_type_); swap(first.data_, second.data_); swap(first.size_, second.size_); swap(first.memory_type_, second.memory_type_); @@ -189,7 +168,7 @@ struct buffer { * @brief Create owning copy of existing buffer with given stream * The device type of this new buffer will be the same as the original */ - buffer(raft::resources const& handle, buffer const& other) : buffer(handle, other, other.mem_type(), other.device_index()) + buffer(raft::resources const& handle, buffer const& other) : buffer(handle, other, other.mem_type()) { } @@ -197,32 +176,26 @@ struct buffer { * @brief Move from 
existing buffer unless a copy is necessary based on * memory location */ - buffer(raft::resources const& handle, buffer&& other, memory_type mem_type, int device = 0) - : device_{[mem_type, &device]() { - auto result = execution_device_id_variant{}; - if (is_device_accessible(mem_type)) { - result = execution_device_id{device}; - } else { - result = execution_device_id{device}; - } - return result; + buffer(raft::resources const& handle, buffer&& other, memory_type mem_type) + : device_type_{[mem_type]() { + return is_device_accessible(mem_type) ? device_type::gpu : device_type::cpu; }()}, - data_{[&other, mem_type, device, handle]() { + data_{[&other, mem_type, handle]() { auto result = data_store{}; - if (mem_type == other.mem_type() && device == other.device_index()) { + if (mem_type == other.mem_type()) { result = std::move(other.data_); } else { auto* result_data = static_cast(nullptr); if (is_device_accessible(mem_type)) { - auto buf = detail::owning_buffer{handle, device, other.size()}; + auto buf = detail::owning_buffer{handle, other.size()}; result_data = buf.get(); result = std::move(buf); - detail::buffer_copy(handle, result_data, other.data(), other.size(), device_type::gpu, other.dev_type()); + detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::gpu, other.dev_type()); } else { auto buf = detail::owning_buffer{other.size()}; result_data = buf.get(); result = std::move(buf); - detail::buffer_copy(handle, result_data, other.data(), other.size(), device_type::cpu, other.dev_type()); + detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::cpu, other.dev_type()); } } return result; @@ -249,14 +222,8 @@ struct buffer { // } buffer(buffer&& other) noexcept - : device_{[&other]() { - auto result = execution_device_id_variant{}; - if (is_device_accessible(other.mem_type())) { - result = execution_device_id{other.device_index()}; - } else { - result = 
execution_device_id{other.device_index()}; - } - return result; + : device_type_{[&other]() { + return is_device_accessible(other.mem_type()) ? device_type::gpu : device_type::cpu; }()}, data_{[&other]() { auto result = data_store{}; @@ -281,7 +248,7 @@ struct buffer { buffer& operator=(buffer&& other) noexcept { RAFT_LOG_INFO("operator= move called"); data_ = std::move(other.data_); - device_ = std::move(other.device_); + device_type_ = std::move(other.device_type_); size_ = std::move(other.size_); memory_type_ = std::move(other.memory_type_); cached_ptr = std::move(other.cached_ptr); @@ -289,7 +256,7 @@ struct buffer { } auto size() const noexcept { return size_; } - HOST DEVICE auto* data() const noexcept { + HOST DEVICE auto* data_handle() const noexcept { auto result = static_cast(nullptr); switch (data_.index()) { case 0: result = std::get<0>(data_).get(); break; @@ -297,20 +264,10 @@ struct buffer { case 2: result = std::get<2>(data_).get(); break; case 3: result = std::get<3>(data_).get(); break; } - RAFT_LOG_INFO("data() called: data %p; cached_ptr %p\n", result, cached_ptr); + RAFT_LOG_INFO("data_handle() called: data %p; cached_ptr %p\n", result, cached_ptr); return result;} - auto device() const noexcept { return device_; } - - auto device_index() const noexcept - { - auto result = int{}; - switch (device_.index()) { - case 0: result = std::get<0>(device_).value(); break; - case 1: result = std::get<1>(device_).value(); break; - } - return result; - } + auto device() const noexcept { return device_type_; } auto mem_type() const noexcept { @@ -322,16 +279,10 @@ struct buffer { private: auto dev_type() const noexcept { - enum device_type result; - if (device_.index() == 0) { - result = device_type::cpu; - } else { - result = device_type::gpu; - } - return result; + return device_type_; } - execution_device_id_variant device_; + enum device_type device_type_; data_store data_; size_t size_; enum memory_type memory_type_; @@ -352,10 +303,10 @@ 
detail::const_agnostic_same_t copy(raft::resources const& handle, } } auto src_device_type = is_device_accessible(src.mem_type()) ? device_type::gpu : device_type::cpu; - auto dst_device_type = is_device_accessible(dst.mem_type()) ? device_type::gpu : device_type::cpu; + auto dst_device_type = is_device_accessible(dst.mem_type()) ? device_type::gpu : device_type::cpu; detail::buffer_copy(handle, - dst.data() + dst_offset, - src.data() + src_offset, + dst.data_handle() + dst_offset, + src.data_handle() + src_offset, size, dst_device_type, src_device_type); diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp index 61cee5aa6e..fd59f871cd 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp @@ -15,7 +15,6 @@ */ #pragma once #include -#include #include #include @@ -25,7 +24,7 @@ namespace detail { template struct owning_buffer { owning_buffer() {} - owning_buffer(raft::resources const& handle, execution_device_id device_id, std::size_t size) {} + owning_buffer(raft::resources const& handle, std::size_t size) {} auto* get() const { return static_cast(nullptr); } }; diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp index dad4cb2da2..8d45bda7e8 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp @@ -17,7 +17,6 @@ #include "owning_buffer_base.hpp" #include #include -#include #include namespace raft { diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp index 81a0f611bf..79b8b25311 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp +++ 
b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp @@ -16,9 +16,7 @@ #pragma once #include "owning_buffer_base.hpp" #include -#include #include -#include #include #include @@ -30,10 +28,8 @@ struct owning_buffer { owning_buffer() : data_{} {} owning_buffer(raft::resources const& handle, - execution_device_id execution_device_id, std::size_t size) noexcept(false) - : data_{[&execution_device_id, &size, handle]() { - auto device_context = device_setter{execution_device_id}; + : data_{[&size, handle]() { return rmm::device_buffer{size * sizeof(value_type), raft::resource::get_cuda_stream(handle)}; }()} { diff --git a/cpp/include/raft/core/detail/device_setter_base.hpp b/cpp/include/raft/core/detail/device_setter_base.hpp deleted file mode 100644 index b3b84f3613..0000000000 --- a/cpp/include/raft/core/detail/device_setter_base.hpp +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once -#include -#include - -namespace raft { -namespace detail { - -/** Struct for setting current device within a code block */ -template -struct device_setter { - device_setter(execution_device_id device) {} -}; - -} // namespace detail -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/device_setter_gpu.hpp b/cpp/include/raft/core/detail/device_setter_gpu.hpp deleted file mode 100644 index 98cb682de6..0000000000 --- a/cpp/include/raft/core/detail/device_setter_gpu.hpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once -#include -#include -#include -#include -#include -#include - -namespace raft { -namespace detail { - -/** Class for setting current device within a code block */ -template <> -struct device_setter { - device_setter(raft::execution_device_id device) noexcept(false) - : prev_device_{[]() { - auto result = int{}; - RAFT_CUDA_TRY(cudaGetDevice(&result)); - return result; - }()} - { - RAFT_CUDA_TRY(cudaSetDevice(device.value())); - } - - ~device_setter() { RAFT_CUDA_TRY_NO_THROW(cudaSetDevice(prev_device_.value())); } - - private: - execution_device_id prev_device_; -}; - -} // namespace detail -} // namespace raft diff --git a/cpp/include/raft/core/detail/execution_device_id_base.hpp b/cpp/include/raft/core/detail/execution_device_id_base.hpp deleted file mode 100644 index fd417d44f1..0000000000 --- a/cpp/include/raft/core/detail/execution_device_id_base.hpp +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once -#include - -namespace raft { -namespace detail { -template -struct execution_device_id { - using value_type = int; - - execution_device_id(value_type device_index) {} - auto value() const { return value_type{}; } -}; -} // namespace detail -} // namespace raft diff --git a/cpp/include/raft/core/detail/execution_device_id_cpu.hpp b/cpp/include/raft/core/detail/execution_device_id_cpu.hpp deleted file mode 100644 index 56b52a6e4c..0000000000 --- a/cpp/include/raft/core/detail/execution_device_id_cpu.hpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include "execution_device_id_base.hpp" -#include - -namespace raft { -namespace detail { -template <> -struct execution_device_id { - using value_type = int; - execution_device_id() : id_{value_type{}} {}; - execution_device_id(value_type dev_id) : id_{dev_id} {}; - - auto value() const noexcept { return id_; } - - private: - value_type id_; -}; -} // namespace detail -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/execution_device_id_gpu.hpp b/cpp/include/raft/core/detail/execution_device_id_gpu.hpp deleted file mode 100644 index a039c8ee02..0000000000 --- a/cpp/include/raft/core/detail/execution_device_id_gpu.hpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include "execution_device_id_base.hpp" -#include -#include -#include - -namespace raft { -namespace detail { -template <> -struct execution_device_id { - using value_type = typename rmm::cuda_device_id::value_type; - execution_device_id() noexcept(false) - : id_{[]() { - auto raw_id = value_type{}; - RAFT_CUDA_TRY(cudaGetDevice(&raw_id)); - return raw_id; - }()} {}; - /* We do not mark this constructor as explicit to allow public API - * functions to accept `device_id` arguments without requiring - * downstream consumers to explicitly construct a device_id. Thus, - * consumers can use the type they expect to use when specifying a device - * (int), but once we are inside the public API, the device type remains - * attached to this value and we can easily convert to the strongly-typed - * rmm::cuda_device_id if desired. - */ - execution_device_id(value_type dev_id) noexcept : id_{dev_id} {}; - - auto value() const noexcept { return id_.value(); } - auto rmm_id() const noexcept { return id_; } - - private: - rmm::cuda_device_id id_; -}; -} // namespace detail -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/device_setter.hpp b/cpp/include/raft/core/device_setter.hpp deleted file mode 100644 index 23c9c91767..0000000000 --- a/cpp/include/raft/core/device_setter.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include -#ifndef RAFT_DISABLE_GPU -#include -#endif -#include - -namespace raft { - -using device_setter = detail::device_setter; - -} \ No newline at end of file diff --git a/cpp/include/raft/core/device_support.hpp b/cpp/include/raft/core/device_support.hpp index c0fe74b33d..c27fd12c5f 100644 --- a/cpp/include/raft/core/device_support.hpp +++ b/cpp/include/raft/core/device_support.hpp @@ -41,10 +41,4 @@ auto constexpr static const DEBUG_ENABLED = false; #else auto constexpr static const DEBUG_ENABLED = true; #endif - -struct cuda_unsupported : raft::exception { - explicit cuda_unsupported(std::string const& msg) : raft::exception{msg} {} - cuda_unsupported() : cuda_unsupported{"CUDA functionality invoked in non-CUDA build"} {} -}; - } // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp index 797464672e..73f4813841 100644 --- a/cpp/include/raft/core/error.hpp +++ b/cpp/include/raft/core/error.hpp @@ -107,6 +107,11 @@ struct bad_cuda_call : logic_error { explicit bad_cuda_call(char const* msg) : logic_error(msg) {} }; +struct cuda_unsupported : logic_error { + cuda_unsupported() : cuda_unsupported("CUDA functionality invoked in non-CUDA build") {} + explicit cuda_unsupported(char const* msg) : logic_error(msg) {} +}; + struct out_of_bounds : logic_error { out_of_bounds() : out_of_bounds("Attempted out-of-bounds memory 
access") {} explicit out_of_bounds(char const* msg) : logic_error(msg) {} diff --git a/cpp/include/raft/core/execution_device_id.hpp b/cpp/include/raft/core/execution_device_id.hpp deleted file mode 100644 index 3e98fcdbe4..0000000000 --- a/cpp/include/raft/core/execution_device_id.hpp +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#ifndef RAFT_DISABLE_GPU -#include -#endif -#include -#include - -namespace raft { -template -using execution_device_id = detail::execution_device_id; - -using execution_device_id_variant = - std::variant, execution_device_id>; -} // namespace raft diff --git a/cpp/test/core/buffer.cpp b/cpp/test/core/buffer.cpp index 5795b12115..9e34c617e1 100644 --- a/cpp/test/core/buffer.cpp +++ b/cpp/test/core/buffer.cpp @@ -28,7 +28,6 @@ TEST(Buffer, default_buffer) auto buf = buffer(); EXPECT_EQ(buf.mem_type(), memory_type::host); EXPECT_EQ(buf.size(), 0); - EXPECT_EQ(buf.device_index(), 0); } TEST(Buffer, device_buffer) @@ -36,23 +35,23 @@ TEST(Buffer, device_buffer) raft::resources handle; auto data = std::vector{1, 2, 3}; auto test_buffers = std::vector>{}; - test_buffers.emplace_back(handle, data.size(), memory_type::device, 0); - test_buffers.emplace_back(handle, data.size(), memory_type::device, 0); + test_buffers.emplace_back(handle, data.size(), memory_type::device); + test_buffers.emplace_back(handle, 
data.size(), memory_type::device); test_buffers.emplace_back(handle, data.size(), memory_type::device); for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::device); ASSERT_EQ(buf.size(), data.size()); #ifndef RAFT_DISABLE_GPU - ASSERT_NE(buf.data(), nullptr); + ASSERT_NE(buf.data_handle(), nullptr); auto data_out = std::vector(data.size()); - cudaMemcpy(static_cast(buf.data()), + cudaMemcpy(static_cast(buf.data_handle()), static_cast(data.data()), sizeof(int) * data.size(), cudaMemcpyHostToDevice); cudaMemcpy(static_cast(data_out.data()), - static_cast(buf.data()), + static_cast(buf.data_handle()), sizeof(int) * data.size(), cudaMemcpyDeviceToHost); EXPECT_THAT(data_out, testing::ElementsAreArray(data)); @@ -73,18 +72,18 @@ TEST(Buffer, non_owning_device_buffer) cudaMemcpyHostToDevice); #endif auto test_buffers = std::vector>{}; - test_buffers.emplace_back(handle, ptr_d, data.size(), memory_type::device, 0); + test_buffers.emplace_back(handle, ptr_d, data.size(), memory_type::device); test_buffers.emplace_back(handle, ptr_d, data.size(), memory_type::device); #ifndef RAFT_DISABLE_GPU for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::device); ASSERT_EQ(buf.size(), data.size()); - ASSERT_EQ(buf.data(), ptr_d); + ASSERT_EQ(buf.data_handle(), ptr_d); auto data_out = std::vector(data.size()); cudaMemcpy(static_cast(data_out.data()), - static_cast(buf.data()), + static_cast(buf.data_handle()), sizeof(int) * data.size(), cudaMemcpyDeviceToHost); EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); @@ -98,20 +97,20 @@ TEST(Buffer, host_buffer) raft::resources handle; auto data = std::vector{1, 2, 3}; auto test_buffers = std::vector>{}; - test_buffers.emplace_back(handle, data.size(), memory_type::host, 0); - test_buffers.emplace_back(handle, data.size(), memory_type::host, 0); + test_buffers.emplace_back(handle, data.size(), memory_type::host); + test_buffers.emplace_back(handle, data.size(), memory_type::host); 
test_buffers.emplace_back(handle, data.size(), memory_type::host); test_buffers.emplace_back(handle, data.size()); for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::host); ASSERT_EQ(buf.size(), data.size()); - ASSERT_NE(buf.data(), nullptr); + ASSERT_NE(buf.data_handle(), nullptr); std::memcpy( - static_cast(buf.data()), static_cast(data.data()), data.size() * sizeof(int)); + static_cast(buf.data_handle()), static_cast(data.data()), data.size() * sizeof(int)); - auto data_out = std::vector(buf.data(), buf.data() + buf.size()); + auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); } } @@ -121,16 +120,16 @@ TEST(Buffer, non_owning_host_buffer) raft::resources handle; auto data = std::vector{1, 2, 3}; std::vector> test_buffers; - test_buffers.emplace_back(handle, data.data(), data.size(), memory_type::host, 0); + test_buffers.emplace_back(handle, data.data(), data.size(), memory_type::host); test_buffers.emplace_back(handle, data.data(), data.size(), memory_type::host); test_buffers.emplace_back(handle, data.data(), data.size()); for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::host); ASSERT_EQ(buf.size(), data.size()); - ASSERT_EQ(buf.data(), data.data()); + ASSERT_EQ(buf.data_handle(), data.data()); - auto data_out = std::vector(buf.data(), buf.data() + buf.size()); + auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); } } @@ -145,46 +144,46 @@ TEST(Buffer, copy_constructor) auto test_buffers = std::vector>{}; test_buffers.emplace_back(handle, orig_buffer); test_buffers.emplace_back(handle, orig_buffer, memory_type::host); - test_buffers.emplace_back(handle, orig_buffer, memory_type::host, 0); - test_buffers.emplace_back(handle, orig_buffer, memory_type::host, 0); + test_buffers.emplace_back(handle, orig_buffer, memory_type::host); + 
test_buffers.emplace_back(handle, orig_buffer, memory_type::host); for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::host); ASSERT_EQ(buf.size(), data.size()); - ASSERT_NE(buf.data(), orig_buffer.data()); + ASSERT_NE(buf.data_handle(), orig_buffer.data_handle()); - auto data_out = std::vector(buf.data(), buf.data() + buf.size()); + auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); #ifndef RAFT_DISABLE_GPU // host to device copy operations auto test_dev_buffers = std::vector>{}; test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); - test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device, 0); - test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device, 0); + test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); + test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); for (auto& dev_buf : test_dev_buffers) { data_out = std::vector(data.size()); - RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(dev_buf.data()), dev_buf.size() * sizeof(int), cudaMemcpyDefault)); + RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(dev_buf.data_handle()), dev_buf.size() * sizeof(int), cudaMemcpyDefault)); EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); // device to device copy operations auto test_dev_copies = std::vector>{}; test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); - test_dev_copies.emplace_back(handle, dev_buf, memory_type::device, 0); - test_dev_copies.emplace_back(handle, dev_buf, memory_type::device, 0); + test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); + test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); for (auto& copy_buf : test_dev_copies) { data_out = std::vector(data.size()); - RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(copy_buf.data()), copy_buf.size() 
* sizeof(int), cudaMemcpyDefault)); + RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(copy_buf.data_handle()), copy_buf.size() * sizeof(int), cudaMemcpyDefault)); EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); } // device to host copy operations auto test_host_buffers = std::vector>{}; test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); - test_host_buffers.emplace_back(handle, dev_buf, memory_type::host, 0); - test_host_buffers.emplace_back(handle, dev_buf, memory_type::host, 0); + test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); + test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); for (auto& host_buf : test_host_buffers) { - data_out = std::vector(host_buf.data(), host_buf.data() + host_buf.size()); + data_out = std::vector(host_buf.data_handle(), host_buf.data_handle() + host_buf.size()); EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); } } @@ -199,29 +198,29 @@ TEST(Buffer, move_buffer) auto test_buffers = std::vector>{}; test_buffers.emplace_back(buffer(handle, data.data(), data.size(), memory_type::host)); test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host); - test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host, 0); - test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host, 0); + test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host); + test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host); for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::host); ASSERT_EQ(buf.size(), data.size()); - ASSERT_EQ(buf.data(), data.data()); + ASSERT_EQ(buf.data_handle(), data.data()); - auto data_out = std::vector(buf.data(), buf.data() + buf.size()); + auto data_out = 
std::vector(buf.data_handle(), buf.data_handle() + buf.size()); EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); } #ifndef RAFT_DISABLE_GPU test_buffers = std::vector>{}; test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device); - test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device, 0); - test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device, 0); + test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device); + test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device); for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::device); ASSERT_EQ(buf.size(), data.size()); - ASSERT_NE(buf.data(), data.data()); + ASSERT_NE(buf.data_handle(), data.data()); auto data_out = std::vector(buf.size()); - RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(buf.data()), buf.size() * sizeof(int), cudaMemcpyDefault)); + RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(buf.data_handle()), buf.size() * sizeof(int), cudaMemcpyDefault)); EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); } #endif diff --git a/cpp/test/core/buffer.cu b/cpp/test/core/buffer.cu index 42b1f1c224..d7b308b4df 100644 --- a/cpp/test/core/buffer.cu +++ b/cpp/test/core/buffer.cu @@ -45,8 +45,7 @@ TEST(Buffer, device_buffer_access) auto buf = buffer( handle, buffer(handle, data.data(), data.size(), memory_type::host), - memory_type::device, - 0); + memory_type::device); // check_buffer_access<<<1,1>>>(buf.data()); // auto data_out = std::vector(expected.size()); // auto host_buf = buffer(data_out.data(), data_out.size(), memory_type::host); From 838bfef52fd666fa16150a7d4de191148efbf75f Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 8 May 2023 12:43:34 
-0700 Subject: [PATCH 18/75] Add container policy --- cpp/include/raft/core/buffer.hpp | 56 ++++++++++++------- .../detail/buffer_utils/non_owning_buffer.hpp | 19 +++++-- .../buffer_utils/owning_buffer_base.hpp | 10 +++- .../detail/buffer_utils/owning_buffer_cpu.hpp | 34 +++++++---- .../detail/buffer_utils/owning_buffer_gpu.hpp | 33 ++++++++--- 5 files changed, 105 insertions(+), 47 deletions(-) diff --git a/cpp/include/raft/core/buffer.hpp b/cpp/include/raft/core/buffer.hpp index 0d49237d2d..930a32e42f 100644 --- a/cpp/include/raft/core/buffer.hpp +++ b/cpp/include/raft/core/buffer.hpp @@ -36,18 +36,22 @@ namespace raft { * @brief A container which may or may not own its own data on host or device * */ -template +template + typename ContainerPolicy, + typename IndexType = std::uint32_t> struct buffer { - using data_store = std::variant, - detail::non_owning_buffer, - detail::owning_buffer, - detail::owning_buffer>; + using data_store = std::variant, + detail::non_owning_buffer, + detail::owning_buffer, + detail::owning_buffer>; buffer() : device_type_{}, data_{}, size_{}, memory_type_{memory_type::host} {} /** Construct non-initialized owning buffer */ buffer(raft::resources const& handle, - size_t size, + IndexType size, memory_type mem_type = memory_type::host) : device_type_{[mem_type]() { return is_device_accessible(mem_type) ? 
device_type::gpu : device_type::cpu; @@ -55,16 +59,16 @@ struct buffer { data_{[this, mem_type, size, handle]() { auto result = data_store{}; if (is_device_accessible(mem_type)) { - result = detail::owning_buffer{handle, size}; + result = detail::owning_buffer{handle, size}; } else { - result = detail::owning_buffer{size}; + result = detail::owning_buffer{handle, size}; } return result; }()}, size_{size}, memory_type_{mem_type}, cached_ptr{[this]() { - auto result = static_cast(nullptr); + auto result = static_cast(nullptr); switch (data_.index()) { case 2: result = std::get<2>(data_).get(); break; case 3: result = std::get<3>(data_).get(); break; @@ -74,8 +78,10 @@ struct buffer { { } - /** Construct non-owning buffer */ - buffer(raft::resources const& handle, T* input_data, size_t size, memory_type mem_type = memory_type::host) + /** Construct non-owning buffer. Currently, users must ensure that the input_data is on the same device_type as the requested mem_type. + This cannot be asserted because checking the device id requires cuda headers (which is against the intended cuda-free build). If + the mem_type is different from the device_type of input_data, the input_data should first be copied to the appropriate location. */ + buffer(raft::resources const& handle, ElementType* input_data, IndexType size, memory_type mem_type = memory_type::host) : device_type_{[mem_type]() { RAFT_LOG_INFO("Non owning constructor call started"); return is_device_accessible(mem_type) ? 
device_type::gpu : device_type::cpu; @@ -83,16 +89,16 @@ struct buffer { data_{[this, input_data, mem_type]() { auto result = data_store{}; if (is_device_accessible(mem_type)) { - result = detail::non_owning_buffer{input_data}; + result = detail::non_owning_buffer{input_data}; } else { - result = detail::non_owning_buffer{input_data}; + result = detail::non_owning_buffer{input_data}; } return result; }()}, size_{size}, memory_type_{mem_type}, cached_ptr{[this]() { - auto result = static_cast(nullptr); + auto result = static_cast(nullptr); RAFT_LOG_INFO("data_index from constructor %d\n", data_.index()); switch (data_.index()) { case 0: result = std::get<0>(data_).get(); break; @@ -106,19 +112,19 @@ struct buffer { } /** - * @brief Construct one buffer from another of the given memory type + * @brief Construct one buffer of the given memory type from another. * A buffer constructed in this way is owning and will copy the data from - * the original location + * the original location. */ buffer(raft::resources const& handle, - buffer const& other, + buffer const& other, memory_type mem_type) : device_type_{[mem_type]() { return is_device_accessible(mem_type) ? 
device_type::gpu : device_type::cpu; }()}, data_{[this, &other, mem_type, handle]() { auto result = data_store{}; - auto result_data = static_cast(nullptr); + auto result_data = static_cast(nullptr); if (is_device_accessible(mem_type)) { auto buf = detail::owning_buffer(handle, other.size()); @@ -188,6 +194,11 @@ struct buffer { auto* result_data = static_cast(nullptr); if (is_device_accessible(mem_type)) { auto buf = detail::owning_buffer{handle, other.size()}; + auto buf = detail::owning_buffer typename ContainerPolicy> result_data = buf.get(); result = std::move(buf); detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::gpu, other.dev_type()); @@ -276,6 +287,10 @@ struct buffer { ~buffer() = default; + auto view() -> view_type { + return make_mdspan mem_type()), is_device_accessible(this -> mem_type())>(data_, make_extents(size_)); + } + private: auto dev_type() const noexcept { @@ -284,9 +299,10 @@ struct buffer { enum device_type device_type_; data_store data_; - size_t size_; + IndexType size_; enum memory_type memory_type_; - T* cached_ptr; + ElementType* cached_ptr; + int device_id_; }; template diff --git a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp index 4ddb294abe..4afc385b75 100644 --- a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp @@ -15,25 +15,34 @@ */ #pragma once #include "raft/core/logger.hpp" +#include "raft/core/mdspan.hpp" #include #include +#include +#include namespace raft { namespace detail { -template +template typename ContainerPolicy> struct non_owning_buffer { - using value_type = std::remove_const_t; + using element_type = std::remove_cv_t; + using index_type = typename Extents::index_type; + using container_policy = ContainerPolicy; + non_owning_buffer() : data_{nullptr} {} - non_owning_buffer(T* ptr) : data_{ptr} { + 
non_owning_buffer(ElementType* ptr) : data_{ptr} { } - auto* get() const { return data_; } private: // TODO(wphicks): Back this with RMM-allocated host memory - T* data_; + ElementType* data_; }; } // namespace detail } // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp index fd59f871cd..9766ebc0c9 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp @@ -21,11 +21,15 @@ namespace raft { namespace detail { -template +template typename ContainerPolicy> struct owning_buffer { owning_buffer() {} - owning_buffer(raft::resources const& handle, std::size_t size) {} - auto* get() const { return static_cast(nullptr); } + owning_buffer(raft::resources const& handle, Extents extents) {} + auto* get() const { return static_cast(nullptr); } }; } // namespace detail diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp index 8d45bda7e8..91166e30af 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp @@ -15,26 +15,40 @@ */ #pragma once #include "owning_buffer_base.hpp" +#include #include #include +#include #include namespace raft { namespace detail { -template -struct owning_buffer { - // TODO(wphicks): Assess need for buffers of const T - using value_type = std::remove_const_t; +template + typename ContainerPolicy = host_vector_policy> +struct owning_buffer { + using element_type = std::remove_cv_t; + using index_type = typename Extents::index_type; + using container_policy = ContainerPolicy; + using owning_host_buffer = host_mdarray; + owning_buffer(raft::resources const& handle, Extents extents) noexcept(false) + : extents_{extents}, data_{[&extents, handle]() { + 
// return rmm::device_buffer{size * sizeof(value_type), raft::resource::get_cuda_stream(handle)}; + typename owning_host_buffer::mapping_type layout{extents}; + typename owning_host_buffer::container_policy_type policy{}; + return owning_host_buffer{handle, layout, policy}; + }()} + { + } - owning_buffer() : data_{std::unique_ptr{nullptr}} {} - - owning_buffer(std::size_t size) : data_{std::make_unique(size)} {} - - auto* get() const { return data_.get(); } + auto* get() const { return reinterpret_cast(data_.data_handle()); } private: // TODO(wphicks): Back this with RMM-allocated host memory - std::unique_ptr data_; + Extents extents_; + owning_host_buffer data_; }; } // namespace detail } // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp index 79b8b25311..2a1c0d8255 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp @@ -15,30 +15,45 @@ */ #pragma once #include "owning_buffer_base.hpp" +#include +#include "raft/core/mdspan_types.hpp" +#include #include #include #include #include +#include namespace raft { namespace detail { -template -struct owning_buffer { - using value_type = std::remove_const_t; +template + typename ContainerPolicy = device_uvector_policy> +struct owning_buffer { + using element_type = std::remove_cv_t; + using index_type = typename Extents::index_type; + using container_policy = ContainerPolicy; + using owning_device_buffer = device_mdarray; + owning_buffer() : data_{} {} - owning_buffer(raft::resources const& handle, - std::size_t size) noexcept(false) - : data_{[&size, handle]() { - return rmm::device_buffer{size * sizeof(value_type), raft::resource::get_cuda_stream(handle)}; + owning_buffer(raft::resources const& handle, Extents extents) noexcept(false) + : extents_{extents}, data_{[&extents, handle]() { + // 
return rmm::device_buffer{size * sizeof(value_type), raft::resource::get_cuda_stream(handle)}; + typename owning_device_buffer::mapping_type layout{extents}; + typename owning_device_buffer::container_policy_type policy{}; + return owning_device_buffer{handle, layout, policy}; }()} { } - auto* get() const { return reinterpret_cast(data_.data()); } + auto* get() const { return reinterpret_cast(data_.data_handle()); } private: - mutable rmm::device_buffer data_; + Extents extents_; + owning_device_buffer data_; }; } // namespace detail } // namespace raft \ No newline at end of file From e035e2e7b5d13b9709e5604b5ecfb88d096859c5 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 10 May 2023 08:36:39 -0700 Subject: [PATCH 19/75] further changes with container policy --- cpp/include/raft/core/buffer.hpp | 89 +++++----- .../detail/buffer_utils/non_owning_buffer.hpp | 15 +- .../buffer_utils/owning_buffer_base.hpp | 4 +- .../detail/buffer_utils/owning_buffer_cpu.hpp | 15 +- .../detail/buffer_utils/owning_buffer_gpu.hpp | 16 +- cpp/test/core/buffer.cpp | 167 +++++++++--------- 6 files changed, 145 insertions(+), 161 deletions(-) diff --git a/cpp/include/raft/core/buffer.hpp b/cpp/include/raft/core/buffer.hpp index 930a32e42f..bfa4fafa06 100644 --- a/cpp/include/raft/core/buffer.hpp +++ b/cpp/include/raft/core/buffer.hpp @@ -16,6 +16,7 @@ #pragma once #include "raft/core/logger.hpp" #include +#include #include #include #include @@ -35,23 +36,26 @@ namespace raft { /** * @brief A container which may or may not own its own data on host or device * + * @tparam ElementType type of the input + * @tparam LayoutPolicy layout of the input + * @tparam ContainerPolicy container to be used to own host/device memory if needed. Users must ensure that the container has the correct type (host/device). Exceptions due to a device container being used for a host buffer and vice versa are not caught by the buffer class. 
+ * @tparam the index type of the extents */ -template - typename ContainerPolicy, - typename IndexType = std::uint32_t> +template struct buffer { - using data_store = std::variant, - detail::non_owning_buffer, - detail::owning_buffer, - detail::owning_buffer>; + using buffer_extent = vector_extent; + using data_store = std::variant, + detail::non_owning_buffer, + detail::owning_buffer, + detail::owning_buffer>; buffer() : device_type_{}, data_{}, size_{}, memory_type_{memory_type::host} {} - /** Construct non-initialized owning buffer */ + /** Construct non-initialized owning buffer. For owning buffers, managed memory is treated as + * device memory only. Therefore, users are discouraged from using managed memory for creating + * owning buffers. */ buffer(raft::resources const& handle, - IndexType size, + size_t size, memory_type mem_type = memory_type::host) : device_type_{[mem_type]() { return is_device_accessible(mem_type) ? device_type::gpu : device_type::cpu; @@ -59,9 +63,9 @@ struct buffer { data_{[this, mem_type, size, handle]() { auto result = data_store{}; if (is_device_accessible(mem_type)) { - result = detail::owning_buffer{handle, size}; + result = detail::owning_buffer{handle, size}; } else { - result = detail::owning_buffer{handle, size}; + result = detail::owning_buffer{handle, size}; } return result; }()}, @@ -80,8 +84,9 @@ struct buffer { /** Construct non-owning buffer. Currently, users must ensure that the input_data is on the same device_type as the requested mem_type. This cannot be asserted because checking the device id requires cuda headers (which is against the intended cuda-free build). If - the mem_type is different from the device_type of input_data, the input_data should first be copied to the appropriate location. 
*/ - buffer(raft::resources const& handle, ElementType* input_data, IndexType size, memory_type mem_type = memory_type::host) + the mem_type is different from the device_type of input_data, the input_data should first be copied to the appropriate location. For + managed memory_type, input_data should be a managed pointer. */ + buffer(raft::resources const& handle, ElementType* input_data, size_t size, memory_type mem_type = memory_type::host) : device_type_{[mem_type]() { RAFT_LOG_INFO("Non owning constructor call started"); return is_device_accessible(mem_type) ? device_type::gpu : device_type::cpu; @@ -89,9 +94,9 @@ struct buffer { data_{[this, input_data, mem_type]() { auto result = data_store{}; if (is_device_accessible(mem_type)) { - result = detail::non_owning_buffer{input_data}; + result = detail::non_owning_buffer{input_data}; } else { - result = detail::non_owning_buffer{input_data}; + result = detail::non_owning_buffer{input_data}; } return result; }()}, @@ -127,13 +132,13 @@ struct buffer { auto result_data = static_cast(nullptr); if (is_device_accessible(mem_type)) { auto buf = - detail::owning_buffer(handle, other.size()); + detail::owning_buffer(handle, other.size()); result_data = buf.get(); result = std::move(buf); RAFT_LOG_INFO("gpu copy called"); detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::gpu, other.dev_type()); } else { - auto buf = detail::owning_buffer(other.size()); + auto buf = detail::owning_buffer(other.size()); result_data = buf.get(); result = std::move(buf); RAFT_LOG_INFO("cpu copy called"); @@ -144,7 +149,7 @@ struct buffer { size_{other.size()}, memory_type_{mem_type}, cached_ptr{[this]() { - auto result = static_cast(nullptr); + auto result = static_cast(nullptr); switch (data_.index()) { case 2: result = std::get<2>(data_).get(); break; case 3: result = std::get<3>(data_).get(); break; @@ -155,7 +160,7 @@ struct buffer { RAFT_LOG_INFO("Pointer to other's data %p\n", other.data_handle()); 
} - friend void swap(buffer& first, buffer& second) + friend void swap(buffer& first, buffer& second) { using std::swap; swap(first.device_type_, second.device_type_); @@ -164,7 +169,7 @@ struct buffer { swap(first.memory_type_, second.memory_type_); swap(first.cached_ptr, second.cached_ptr); } - buffer& operator=(buffer const& other) { + buffer& operator=(buffer const& other) { auto copy = other; swap(*this, copy); return *this; @@ -174,7 +179,7 @@ struct buffer { * @brief Create owning copy of existing buffer with given stream * The device type of this new buffer will be the same as the original */ - buffer(raft::resources const& handle, buffer const& other) : buffer(handle, other, other.mem_type()) + buffer(raft::resources const& handle, buffer const& other) : buffer(handle, other, other.mem_type()) { } @@ -182,7 +187,7 @@ struct buffer { * @brief Move from existing buffer unless a copy is necessary based on * memory location */ - buffer(raft::resources const& handle, buffer&& other, memory_type mem_type) + buffer(raft::resources const& handle, buffer&& other, memory_type mem_type) : device_type_{[mem_type]() { return is_device_accessible(mem_type) ? 
device_type::gpu : device_type::cpu; }()}, @@ -191,19 +196,18 @@ struct buffer { if (mem_type == other.mem_type()) { result = std::move(other.data_); } else { - auto* result_data = static_cast(nullptr); + auto* result_data = static_cast(nullptr); if (is_device_accessible(mem_type)) { - auto buf = detail::owning_buffer{handle, other.size()}; - auto buf = detail::owning_buffer typename ContainerPolicy> + buffer_extent>{handle, other.size()}; result_data = buf.get(); result = std::move(buf); detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::gpu, other.dev_type()); } else { - auto buf = detail::owning_buffer{other.size()}; + auto buf = detail::owning_buffer{other.size()}; result_data = buf.get(); result = std::move(buf); detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::cpu, other.dev_type()); @@ -214,7 +218,7 @@ struct buffer { size_{other.size()}, memory_type_{mem_type}, cached_ptr{[this]() { - auto result = static_cast(nullptr); + auto result = static_cast(nullptr); switch (data_.index()) { case 0: result = std::get<0>(data_).get(); break; case 1: result = std::get<1>(data_).get(); break; @@ -232,7 +236,7 @@ struct buffer { // RAFT_LOG_INFO("copy constructor without stream and device called"); // } - buffer(buffer&& other) noexcept + buffer(buffer&& other) noexcept : device_type_{[&other]() { return is_device_accessible(other.mem_type()) ? 
device_type::gpu : device_type::cpu; }()}, @@ -244,7 +248,7 @@ struct buffer { size_{other.size()}, memory_type_{other.mem_type()}, cached_ptr{[this]() { - auto result = static_cast(nullptr); + auto result = static_cast(nullptr); switch (data_.index()) { case 0: result = std::get<0>(data_).get(); break; case 1: result = std::get<1>(data_).get(); break; @@ -256,7 +260,7 @@ struct buffer { { RAFT_LOG_INFO("trivial move called"); } - buffer& operator=(buffer&& other) noexcept { + buffer& operator=(buffer&& other) noexcept { RAFT_LOG_INFO("operator= move called"); data_ = std::move(other.data_); device_type_ = std::move(other.device_type_); @@ -268,7 +272,7 @@ struct buffer { auto size() const noexcept { return size_; } HOST DEVICE auto* data_handle() const noexcept { - auto result = static_cast(nullptr); + auto result = static_cast(nullptr); switch (data_.index()) { case 0: result = std::get<0>(data_).get(); break; case 1: result = std::get<1>(data_).get(); break; @@ -278,8 +282,6 @@ struct buffer { RAFT_LOG_INFO("data_handle() called: data %p; cached_ptr %p\n", result, cached_ptr); return result;} - auto device() const noexcept { return device_type_; } - auto mem_type() const noexcept { return memory_type_; @@ -287,9 +289,9 @@ struct buffer { ~buffer() = default; - auto view() -> view_type { - return make_mdspan mem_type()), is_device_accessible(this -> mem_type())>(data_, make_extents(size_)); - } + // auto view() -> view_type { + // return make_mdspan mem_type()), is_device_accessible(this -> mem_type())>(data_, make_extents(size_)); + // } private: auto dev_type() const noexcept @@ -299,15 +301,14 @@ struct buffer { enum device_type device_type_; data_store data_; - IndexType size_; + size_t size_; enum memory_type memory_type_; ElementType* cached_ptr; - int device_id_; }; template detail::const_agnostic_same_t copy(raft::resources const& handle, - buffer& dst, + buffer & dst, buffer const& src, size_t dst_offset, size_t src_offset, diff --git 
a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp index 4afc385b75..2a8c58fce2 100644 --- a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp @@ -14,24 +14,20 @@ * limitations under the License. */ #pragma once +#include "raft/core/host_container_policy.hpp" #include "raft/core/logger.hpp" -#include "raft/core/mdspan.hpp" +// #include "raft/core/mdspan.hpp" #include #include -#include -#include +// #include +// #include namespace raft { namespace detail { template typename ContainerPolicy> + typename Extents> struct non_owning_buffer { - using element_type = std::remove_cv_t; - using index_type = typename Extents::index_type; - using container_policy = ContainerPolicy; non_owning_buffer() : data_{nullptr} {} @@ -44,5 +40,6 @@ struct non_owning_buffer { // TODO(wphicks): Back this with RMM-allocated host memory ElementType* data_; }; + } // namespace detail } // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp index 9766ebc0c9..62f6b69195 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp @@ -23,9 +23,7 @@ namespace detail { template typename ContainerPolicy> + typename Extents> struct owning_buffer { owning_buffer() {} owning_buffer(raft::resources const& handle, Extents extents) {} diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp index 91166e30af..1d3e196fbd 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp @@ -15,6 +15,7 @@ */ #pragma once #include "owning_buffer_base.hpp" 
+#include "raft/core/mdspan.hpp" #include #include #include @@ -24,18 +25,13 @@ namespace raft { namespace detail { template - typename ContainerPolicy = host_vector_policy> -struct owning_buffer { + typename Extents> +struct owning_buffer { using element_type = std::remove_cv_t; - using index_type = typename Extents::index_type; - using container_policy = ContainerPolicy; - using owning_host_buffer = host_mdarray; + using container_policy = host_vector_policy; + using owning_host_buffer = host_mdarray; owning_buffer(raft::resources const& handle, Extents extents) noexcept(false) : extents_{extents}, data_{[&extents, handle]() { - // return rmm::device_buffer{size * sizeof(value_type), raft::resource::get_cuda_stream(handle)}; typename owning_host_buffer::mapping_type layout{extents}; typename owning_host_buffer::container_policy_type policy{}; return owning_host_buffer{handle, layout, policy}; @@ -46,7 +42,6 @@ struct owning_buffer(data_.data_handle()); } private: - // TODO(wphicks): Back this with RMM-allocated host memory Extents extents_; owning_host_buffer data_; }; diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp index 2a1c0d8255..37dcc423aa 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp @@ -16,26 +16,18 @@ #pragma once #include "owning_buffer_base.hpp" #include -#include "raft/core/mdspan_types.hpp" #include #include #include -#include -#include -#include namespace raft { namespace detail { template - typename ContainerPolicy = device_uvector_policy> -struct owning_buffer { + typename Extents> +struct owning_buffer { using element_type = std::remove_cv_t; - using index_type = typename Extents::index_type; - using container_policy = ContainerPolicy; - using owning_device_buffer = device_mdarray; + using container_policy = device_uvector_policy; + using 
owning_device_buffer = device_mdarray; owning_buffer() : data_{} {} diff --git a/cpp/test/core/buffer.cpp b/cpp/test/core/buffer.cpp index 9e34c617e1..d2fa610f17 100644 --- a/cpp/test/core/buffer.cpp +++ b/cpp/test/core/buffer.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -30,34 +31,34 @@ TEST(Buffer, default_buffer) EXPECT_EQ(buf.size(), 0); } -TEST(Buffer, device_buffer) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - auto test_buffers = std::vector>{}; - test_buffers.emplace_back(handle, data.size(), memory_type::device); - test_buffers.emplace_back(handle, data.size(), memory_type::device); - test_buffers.emplace_back(handle, data.size(), memory_type::device); - - for (auto& buf : test_buffers) { - ASSERT_EQ(buf.mem_type(), memory_type::device); - ASSERT_EQ(buf.size(), data.size()); -#ifndef RAFT_DISABLE_GPU - ASSERT_NE(buf.data_handle(), nullptr); - - auto data_out = std::vector(data.size()); - cudaMemcpy(static_cast(buf.data_handle()), - static_cast(data.data()), - sizeof(int) * data.size(), - cudaMemcpyHostToDevice); - cudaMemcpy(static_cast(data_out.data()), - static_cast(buf.data_handle()), - sizeof(int) * data.size(), - cudaMemcpyDeviceToHost); - EXPECT_THAT(data_out, testing::ElementsAreArray(data)); -#endif - } -} +// TEST(Buffer, device_buffer) +// { +// raft::resources handle; +// auto data = std::vector{1, 2, 3}; +// auto test_buffers = std::vector>{}; +// test_buffers.emplace_back(handle, data.size(), memory_type::device); +// test_buffers.emplace_back(handle, data.size(), memory_type::device); +// test_buffers.emplace_back(handle, data.size(), memory_type::device); + +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::device); +// ASSERT_EQ(buf.size(), data.size()); +// #ifndef RAFT_DISABLE_GPU +// ASSERT_NE(buf.data_handle(), nullptr); + +// auto data_out = std::vector(data.size()); +// cudaMemcpy(static_cast(buf.data_handle()), +// 
static_cast(data.data()), +// sizeof(int) * data.size(), +// cudaMemcpyHostToDevice); +// cudaMemcpy(static_cast(data_out.data()), +// static_cast(buf.data_handle()), +// sizeof(int) * data.size(), +// cudaMemcpyDeviceToHost); +// EXPECT_THAT(data_out, testing::ElementsAreArray(data)); +// #endif +// } +// } TEST(Buffer, non_owning_device_buffer) { @@ -134,62 +135,62 @@ TEST(Buffer, non_owning_host_buffer) } } -TEST(Buffer, copy_constructor) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - buffer const orig_buffer = buffer(handle, data.data(), data.size(), memory_type::host); - - // host to host copy operations - auto test_buffers = std::vector>{}; - test_buffers.emplace_back(handle, orig_buffer); - test_buffers.emplace_back(handle, orig_buffer, memory_type::host); - test_buffers.emplace_back(handle, orig_buffer, memory_type::host); - test_buffers.emplace_back(handle, orig_buffer, memory_type::host); - - for (auto& buf : test_buffers) { - ASSERT_EQ(buf.mem_type(), memory_type::host); - ASSERT_EQ(buf.size(), data.size()); - ASSERT_NE(buf.data_handle(), orig_buffer.data_handle()); - - auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - -#ifndef RAFT_DISABLE_GPU - // host to device copy operations - auto test_dev_buffers = std::vector>{}; - test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); - test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); - test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); - for (auto& dev_buf : test_dev_buffers) { - data_out = std::vector(data.size()); - RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(dev_buf.data_handle()), dev_buf.size() * sizeof(int), cudaMemcpyDefault)); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// TEST(Buffer, copy_constructor) +// { +// raft::resources handle; +// auto data = std::vector{1, 2, 3}; +// buffer const 
orig_buffer = buffer(handle, data.data(), data.size(), memory_type::host); + +// // host to host copy operations +// auto test_buffers = std::vector>{}; +// test_buffers.emplace_back(handle, orig_buffer); +// test_buffers.emplace_back(handle, orig_buffer, memory_type::host); +// test_buffers.emplace_back(handle, orig_buffer, memory_type::host); +// test_buffers.emplace_back(handle, orig_buffer, memory_type::host); + +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::host); +// ASSERT_EQ(buf.size(), data.size()); +// ASSERT_NE(buf.data_handle(), orig_buffer.data_handle()); + +// auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + +// #ifndef RAFT_DISABLE_GPU +// // host to device copy operations +// auto test_dev_buffers = std::vector>{}; +// test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); +// test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); +// test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); +// for (auto& dev_buf : test_dev_buffers) { +// data_out = std::vector(data.size()); +// RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(dev_buf.data_handle()), dev_buf.size() * sizeof(int), cudaMemcpyDefault)); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - // device to device copy operations - auto test_dev_copies = std::vector>{}; - test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); - test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); - test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); - for (auto& copy_buf : test_dev_copies) { - data_out = std::vector(data.size()); - RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(copy_buf.data_handle()), copy_buf.size() * sizeof(int), cudaMemcpyDefault)); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - } - - // device to host 
copy operations - auto test_host_buffers = std::vector>{}; - test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); - test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); - test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); - for (auto& host_buf : test_host_buffers) { - data_out = std::vector(host_buf.data_handle(), host_buf.data_handle() + host_buf.size()); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - } - } -#endif - } -} +// // device to device copy operations +// auto test_dev_copies = std::vector>{}; +// test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); +// test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); +// test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); +// for (auto& copy_buf : test_dev_copies) { +// data_out = std::vector(data.size()); +// RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(copy_buf.data_handle()), copy_buf.size() * sizeof(int), cudaMemcpyDefault)); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// } + +// // device to host copy operations +// auto test_host_buffers = std::vector>{}; +// test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); +// test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); +// test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); +// for (auto& host_buf : test_host_buffers) { +// data_out = std::vector(host_buf.data_handle(), host_buf.data_handle() + host_buf.size()); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// } +// } +// #endif +// } +// } TEST(Buffer, move_buffer) { From 338c1a60274fd6be7ff94ea854a4cc27d837a899 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Fri, 12 May 2023 09:46:43 -0700 Subject: [PATCH 20/75] Some updates --- cpp/include/raft/core/buffer.hpp | 34 ++-- .../detail/buffer_utils/owning_buffer_cpu.hpp | 5 +- .../detail/buffer_utils/owning_buffer_gpu.hpp | 6 +- cpp/test/core/buffer.cpp | 
166 +++++++++--------- 4 files changed, 112 insertions(+), 99 deletions(-) diff --git a/cpp/include/raft/core/buffer.hpp b/cpp/include/raft/core/buffer.hpp index bfa4fafa06..9bfdc65f12 100644 --- a/cpp/include/raft/core/buffer.hpp +++ b/cpp/include/raft/core/buffer.hpp @@ -38,7 +38,10 @@ namespace raft { * * @tparam ElementType type of the input * @tparam LayoutPolicy layout of the input - * @tparam ContainerPolicy container to be used to own host/device memory if needed. Users must ensure that the container has the correct type (host/device). Exceptions due to a device container being used for a host buffer and vice versa are not caught by the buffer class. + * @tparam ContainerPolicy container to be used to own host/device memory if needed. + * Users must ensure that the container has the correct type (host/device). Exceptions + * due to a device container being used for a host buffer and vice versa are not caught + * by the buffer class. * @tparam the index type of the extents */ template @@ -60,12 +63,15 @@ struct buffer { : device_type_{[mem_type]() { return is_device_accessible(mem_type) ? device_type::gpu : device_type::cpu; }()}, + extents_{[size]() { + return make_extents(size); + }()}, data_{[this, mem_type, size, handle]() { auto result = data_store{}; if (is_device_accessible(mem_type)) { - result = detail::owning_buffer{handle, size}; + result = detail::owning_buffer{handle, extents_}; } else { - result = detail::owning_buffer{handle, size}; + result = detail::owning_buffer{handle, extents_}; } return result; }()}, @@ -91,6 +97,9 @@ struct buffer { RAFT_LOG_INFO("Non owning constructor call started"); return is_device_accessible(mem_type) ? device_type::gpu : device_type::cpu; }()}, + extents_{[size]() { + return make_extents(size); + }()}, data_{[this, input_data, mem_type]() { auto result = data_store{}; if (is_device_accessible(mem_type)) { @@ -127,18 +136,19 @@ struct buffer { : device_type_{[mem_type]() { return is_device_accessible(mem_type) ? 
device_type::gpu : device_type::cpu; }()}, + extents_{other.extents()}, data_{[this, &other, mem_type, handle]() { auto result = data_store{}; auto result_data = static_cast(nullptr); if (is_device_accessible(mem_type)) { auto buf = - detail::owning_buffer(handle, other.size()); + detail::owning_buffer(handle, extents_); result_data = buf.get(); result = std::move(buf); RAFT_LOG_INFO("gpu copy called"); detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::gpu, other.dev_type()); } else { - auto buf = detail::owning_buffer(other.size()); + auto buf = detail::owning_buffer(handle, extents_); result_data = buf.get(); result = std::move(buf); RAFT_LOG_INFO("cpu copy called"); @@ -191,7 +201,8 @@ struct buffer { : device_type_{[mem_type]() { return is_device_accessible(mem_type) ? device_type::gpu : device_type::cpu; }()}, - data_{[&other, mem_type, handle]() { + extents_{other.extents()}, + data_{[&other, mem_type, handle, this]() { auto result = data_store{}; if (mem_type == other.mem_type()) { result = std::move(other.data_); @@ -200,14 +211,14 @@ struct buffer { if (is_device_accessible(mem_type)) { auto buf = detail::owning_buffer{handle, other.size()}; + buffer_extent>{handle, extents_}; result_data = buf.get(); result = std::move(buf); detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::gpu, other.dev_type()); } else { auto buf = detail::owning_buffer{other.size()}; + buffer_extent>{handle, extents_}; result_data = buf.get(); result = std::move(buf); detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::cpu, other.dev_type()); @@ -269,15 +280,15 @@ struct buffer { cached_ptr = std::move(other.cached_ptr); return *this; } - + auto extents() const noexcept { return extents_; } auto size() const noexcept { return size_; } HOST DEVICE auto* data_handle() const noexcept { auto result = static_cast(nullptr); switch (data_.index()) { case 0: result = 
std::get<0>(data_).get(); break; case 1: result = std::get<1>(data_).get(); break; - case 2: result = std::get<2>(data_).get(); break; - case 3: result = std::get<3>(data_).get(); break; + case 2: result = std::get<2>(data_).get(); break; + case 3: result = std::get<3>(data_).get(); break; } RAFT_LOG_INFO("data_handle() called: data %p; cached_ptr %p\n", result, cached_ptr); return result;} @@ -300,6 +311,7 @@ struct buffer { } enum device_type device_type_; + buffer_extent extents_; data_store data_; size_t size_; enum memory_type memory_type_; diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp index 1d3e196fbd..30f69b12df 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp @@ -34,12 +34,13 @@ struct owning_buffer { : extents_{extents}, data_{[&extents, handle]() { typename owning_host_buffer::mapping_type layout{extents}; typename owning_host_buffer::container_policy_type policy{}; - return owning_host_buffer{handle, layout, policy}; + owning_host_buffer host_data{handle, layout, policy}; + return host_data; }()} { } - auto* get() const { return reinterpret_cast(data_.data_handle()); } + auto* get() const { return const_cast(data_.data_handle()); } private: Extents extents_; diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp index 37dcc423aa..19a9e6de6e 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp @@ -33,15 +33,15 @@ struct owning_buffer { owning_buffer(raft::resources const& handle, Extents extents) noexcept(false) : extents_{extents}, data_{[&extents, handle]() { - // return rmm::device_buffer{size * sizeof(value_type), raft::resource::get_cuda_stream(handle)}; typename 
owning_device_buffer::mapping_type layout{extents}; typename owning_device_buffer::container_policy_type policy{}; - return owning_device_buffer{handle, layout, policy}; + owning_device_buffer device_data{handle, layout, policy}; + return device_data; }()} { } - auto* get() const { return reinterpret_cast(data_.data_handle()); } + auto* get() const { return const_cast(data_.data_handle()); } private: Extents extents_; diff --git a/cpp/test/core/buffer.cpp b/cpp/test/core/buffer.cpp index d2fa610f17..c2cc7855f7 100644 --- a/cpp/test/core/buffer.cpp +++ b/cpp/test/core/buffer.cpp @@ -31,34 +31,34 @@ TEST(Buffer, default_buffer) EXPECT_EQ(buf.size(), 0); } -// TEST(Buffer, device_buffer) -// { -// raft::resources handle; -// auto data = std::vector{1, 2, 3}; -// auto test_buffers = std::vector>{}; -// test_buffers.emplace_back(handle, data.size(), memory_type::device); -// test_buffers.emplace_back(handle, data.size(), memory_type::device); -// test_buffers.emplace_back(handle, data.size(), memory_type::device); - -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::device); -// ASSERT_EQ(buf.size(), data.size()); -// #ifndef RAFT_DISABLE_GPU -// ASSERT_NE(buf.data_handle(), nullptr); - -// auto data_out = std::vector(data.size()); -// cudaMemcpy(static_cast(buf.data_handle()), -// static_cast(data.data()), -// sizeof(int) * data.size(), -// cudaMemcpyHostToDevice); -// cudaMemcpy(static_cast(data_out.data()), -// static_cast(buf.data_handle()), -// sizeof(int) * data.size(), -// cudaMemcpyDeviceToHost); -// EXPECT_THAT(data_out, testing::ElementsAreArray(data)); -// #endif -// } -// } +TEST(Buffer, device_buffer) +{ + raft::resources handle; + auto data = std::vector{1, 2, 3}; + auto test_buffers = std::vector>{}; + test_buffers.emplace_back(handle, data.size(), memory_type::device); + // test_buffers.emplace_back(handle, data.size(), memory_type::device); + // test_buffers.emplace_back(handle, data.size(), memory_type::device); + + for 
(auto& buf : test_buffers) { + ASSERT_EQ(buf.mem_type(), memory_type::device); + ASSERT_EQ(buf.size(), data.size()); +#ifndef RAFT_DISABLE_GPU + ASSERT_NE(buf.data_handle(), nullptr); + + auto data_out = std::vector(data.size()); + cudaMemcpy(static_cast(buf.data_handle()), + static_cast(data.data()), + sizeof(int) * data.size(), + cudaMemcpyHostToDevice); + cudaMemcpy(static_cast(data_out.data()), + static_cast(buf.data_handle()), + sizeof(int) * data.size(), + cudaMemcpyDeviceToHost); + EXPECT_THAT(data_out, testing::ElementsAreArray(data)); +#endif + } +} TEST(Buffer, non_owning_device_buffer) { @@ -135,62 +135,62 @@ TEST(Buffer, non_owning_host_buffer) } } -// TEST(Buffer, copy_constructor) -// { -// raft::resources handle; -// auto data = std::vector{1, 2, 3}; -// buffer const orig_buffer = buffer(handle, data.data(), data.size(), memory_type::host); - -// // host to host copy operations -// auto test_buffers = std::vector>{}; -// test_buffers.emplace_back(handle, orig_buffer); -// test_buffers.emplace_back(handle, orig_buffer, memory_type::host); -// test_buffers.emplace_back(handle, orig_buffer, memory_type::host); -// test_buffers.emplace_back(handle, orig_buffer, memory_type::host); - -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::host); -// ASSERT_EQ(buf.size(), data.size()); -// ASSERT_NE(buf.data_handle(), orig_buffer.data_handle()); - -// auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - -// #ifndef RAFT_DISABLE_GPU -// // host to device copy operations -// auto test_dev_buffers = std::vector>{}; -// test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); -// test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); -// test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); -// for (auto& dev_buf : test_dev_buffers) { -// data_out = std::vector(data.size()); -// 
RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(dev_buf.data_handle()), dev_buf.size() * sizeof(int), cudaMemcpyDefault)); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +TEST(Buffer, copy_constructor) +{ + raft::resources handle; + auto data = std::vector{1, 2, 3}; + buffer const orig_buffer = buffer(handle, data.data(), data.size(), memory_type::host); + + // host to host copy operations + auto test_buffers = std::vector>{}; + test_buffers.emplace_back(handle, orig_buffer); + test_buffers.emplace_back(handle, orig_buffer, memory_type::host); + test_buffers.emplace_back(handle, orig_buffer, memory_type::host); + test_buffers.emplace_back(handle, orig_buffer, memory_type::host); + + for (auto& buf : test_buffers) { + ASSERT_EQ(buf.mem_type(), memory_type::host); + ASSERT_EQ(buf.size(), data.size()); + ASSERT_NE(buf.data_handle(), orig_buffer.data_handle()); + + auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + +#ifndef RAFT_DISABLE_GPU + // host to device copy operations + auto test_dev_buffers = std::vector>{}; + test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); + test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); + test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); + for (auto& dev_buf : test_dev_buffers) { + data_out = std::vector(data.size()); + RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(dev_buf.data_handle()), dev_buf.size() * sizeof(int), cudaMemcpyDefault)); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// // device to device copy operations -// auto test_dev_copies = std::vector>{}; -// test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); -// test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); -// test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); -// for (auto& copy_buf : 
test_dev_copies) { -// data_out = std::vector(data.size()); -// RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(copy_buf.data_handle()), copy_buf.size() * sizeof(int), cudaMemcpyDefault)); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// } - -// // device to host copy operations -// auto test_host_buffers = std::vector>{}; -// test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); -// test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); -// test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); -// for (auto& host_buf : test_host_buffers) { -// data_out = std::vector(host_buf.data_handle(), host_buf.data_handle() + host_buf.size()); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// } -// } -// #endif -// } -// } + // device to device copy operations + auto test_dev_copies = std::vector>{}; + test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); + test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); + test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); + // for (auto& copy_buf : test_dev_copies) { + // data_out = std::vector(data.size()); + // RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(copy_buf.data_handle()), copy_buf.size() * sizeof(int), cudaMemcpyDefault)); + // EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + // } + + // // device to host copy operations + // auto test_host_buffers = std::vector>{}; + // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); + // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); + // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); + // for (auto& host_buf : test_host_buffers) { + // data_out = std::vector(host_buf.data_handle(), host_buf.data_handle() + host_buf.size()); + // EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + // } + } +#endif + } +} TEST(Buffer, move_buffer) { From 
6468c2426818bd37ce22f6f4514b1808911dd9a9 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 7 Jun 2023 12:52:55 -0700 Subject: [PATCH 21/75] update container_policy --- .../raft/core/buffer_container_policy.hpp | 32 +++++ .../detail/buffer_utils/non_owning_buffer.hpp | 25 ++-- .../buffer_utils/owning_buffer_base.hpp | 6 +- .../detail/buffer_utils/owning_buffer_cpu.hpp | 12 +- .../detail/buffer_utils/owning_buffer_gpu.hpp | 12 +- .../raft/core/{buffer.hpp => mdbuffer.hpp} | 123 ++++++++++++------ cpp/test/core/buffer.cpp | 5 +- 7 files changed, 146 insertions(+), 69 deletions(-) create mode 100644 cpp/include/raft/core/buffer_container_policy.hpp rename cpp/include/raft/core/{buffer.hpp => mdbuffer.hpp} (74%) diff --git a/cpp/include/raft/core/buffer_container_policy.hpp b/cpp/include/raft/core/buffer_container_policy.hpp new file mode 100644 index 0000000000..55712cf55d --- /dev/null +++ b/cpp/include/raft/core/buffer_container_policy.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#include +#include +#ifndef RAFT_DISABLE_GPU +#include +#endif + +namespace raft { +#ifdef RAFT_DISABLE_GPU +template +using buffer_container_policy = std::variant>; +#else +template +using buffer_container_policy = std::variant, raft::device_uvector_policy>; +#endif +} \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp index 2a8c58fce2..94052ef7fd 100644 --- a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp @@ -14,31 +14,32 @@ * limitations under the License. */ #pragma once -#include "raft/core/host_container_policy.hpp" -#include "raft/core/logger.hpp" -// #include "raft/core/mdspan.hpp" -#include -#include -// #include -// #include +#include +#include namespace raft { namespace detail { template + memory_type M, + typename Extents, + typename LayoutPolicy = layout_c_contiguous> struct non_owning_buffer { non_owning_buffer() : data_{nullptr} {} - non_owning_buffer(ElementType* ptr) : data_{ptr} { + non_owning_buffer(ElementType* ptr, Extents extents) : data_{ptr}, extents_{extents} { } - auto* get() const { return data_; } + auto* data_handle() const { return data_; } + auto* view() { + bool device_accessible = is_device_accessible(M); + bool host_accessible = is_host_accessible(M); + return make_mdspan(data_, extents_); + } private: - // TODO(wphicks): Back this with RMM-allocated host memory ElementType* data_; + Extents extents_; }; } // namespace detail diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp index 62f6b69195..6b7b1e44b1 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp @@ -22,8 +22,10 @@ namespace raft { namespace detail { 
template + device_type D, + typename Extents, + typename LayoutPolicy, + template typename ContainerPolicy> struct owning_buffer { owning_buffer() {} owning_buffer(raft::resources const& handle, Extents extents) {} diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp index 30f69b12df..c49683b62b 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp @@ -24,12 +24,14 @@ namespace raft { namespace detail { -template -struct owning_buffer { + template typename ContainerPolicy = host_vector_policy> +struct owning_buffer { using element_type = std::remove_cv_t; - using container_policy = host_vector_policy; - using owning_host_buffer = host_mdarray; + using container_policy = ContainerPolicy; + using owning_host_buffer = host_mdarray; owning_buffer(raft::resources const& handle, Extents extents) noexcept(false) : extents_{extents}, data_{[&extents, handle]() { typename owning_host_buffer::mapping_type layout{extents}; diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp index 19a9e6de6e..8845da4bb8 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp @@ -22,12 +22,14 @@ namespace raft { namespace detail { -template -struct owning_buffer { + template typename ContainerPolicy = device_uvector_policy> +struct owning_buffer { using element_type = std::remove_cv_t; - using container_policy = device_uvector_policy; - using owning_device_buffer = device_mdarray; + using container_policy = ContainerPolicy; + using owning_device_buffer = device_mdarray; owning_buffer() : data_{} {} diff --git a/cpp/include/raft/core/buffer.hpp b/cpp/include/raft/core/mdbuffer.hpp similarity index 74% rename from 
cpp/include/raft/core/buffer.hpp rename to cpp/include/raft/core/mdbuffer.hpp index 9bfdc65f12..6b11c589c1 100644 --- a/cpp/include/raft/core/buffer.hpp +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -14,12 +14,14 @@ * limitations under the License. */ #pragma once +#include "raft/core/device_container_policy.hpp" #include "raft/core/logger.hpp" #include #include #include #include #include +#include #include #include #include @@ -44,44 +46,51 @@ namespace raft { * by the buffer class. * @tparam the index type of the extents */ -template +template typename ContainerPolicy = buffer_container_policy> struct buffer { - using buffer_extent = vector_extent; - using data_store = std::variant, - detail::non_owning_buffer, - detail::owning_buffer, - detail::owning_buffer>; + using data_store = std::variant, + detail::non_owning_buffer, + detail::non_owning_buffer, + detail::owning_buffer, + detail::owning_buffer>; - buffer() : device_type_{}, data_{}, size_{}, memory_type_{memory_type::host} {} + buffer() : device_type_{}, data_{}, length_{0}, memory_type_{memory_type::host} {} /** Construct non-initialized owning buffer. For owning buffers, managed memory is treated as * device memory only. Therefore, users are discouraged from using managed memory for creating * owning buffers. */ buffer(raft::resources const& handle, - size_t size, - memory_type mem_type = memory_type::host) + Extents extents, + memory_type mem_type = memory_type::host) : device_type_{[mem_type]() { return is_device_accessible(mem_type) ? 
device_type::gpu : device_type::cpu; }()}, - extents_{[size]() { - return make_extents(size); - }()}, - data_{[this, mem_type, size, handle]() { + extents_{extents}, + length_([this]() { + std::size_t length = 1; + for (std::size_t i = 0; i < extents_.rank(); ++i) { + length *= extents_.extent(i); + } + return length; + }()), + data_{[this, mem_type, handle]() { auto result = data_store{}; if (is_device_accessible(mem_type)) { - result = detail::owning_buffer{handle, extents_}; + result = detail::owning_buffer{handle, extents_}; } else { - result = detail::owning_buffer{handle, extents_}; + result = detail::owning_buffer{handle, extents_}; } return result; }()}, - size_{size}, memory_type_{mem_type}, cached_ptr{[this]() { auto result = static_cast(nullptr); switch (data_.index()) { - case 2: result = std::get<2>(data_).get(); break; case 3: result = std::get<3>(data_).get(); break; + case 4: result = std::get<4>(data_).get(); break; } return result; }()} @@ -89,27 +98,33 @@ struct buffer { } /** Construct non-owning buffer. Currently, users must ensure that the input_data is on the same device_type as the requested mem_type. - This cannot be asserted because checking the device id requires cuda headers (which is against the intended cuda-free build). If + This cannot be asserted because checking the device id requires CUDA headers (which is against the intended cpu-gpu interop). If the mem_type is different from the device_type of input_data, the input_data should first be copied to the appropriate location. For managed memory_type, input_data should be a managed pointer. */ - buffer(raft::resources const& handle, ElementType* input_data, size_t size, memory_type mem_type = memory_type::host) + buffer(raft::resources const& handle, ElementType* input_data, Extents extents, memory_type mem_type = memory_type::host) : device_type_{[mem_type]() { RAFT_LOG_INFO("Non owning constructor call started"); return is_device_accessible(mem_type) ? 
device_type::gpu : device_type::cpu; }()}, - extents_{[size]() { - return make_extents(size); - }()}, + extents_{extents}, + length_([this]() { + std::size_t length = 1; + for (std::size_t i = 0; i < extents_.rank(); ++i) { + length *= extents_.extent(i); + } + return length; + }()), data_{[this, input_data, mem_type]() { auto result = data_store{}; - if (is_device_accessible(mem_type)) { - result = detail::non_owning_buffer{input_data}; + if (is_host_device_accessible(mem_type)) { + result = detail::non_owning_buffer{input_data, extents_}; + } else if (is_device_accessible(mem_type)) { + result = detail::non_owning_buffer{input_data, extents_}; } else { - result = detail::non_owning_buffer{input_data}; + result = detail::non_owning_buffer{input_data, extents_}; } return result; }()}, - size_{size}, memory_type_{mem_type}, cached_ptr{[this]() { auto result = static_cast(nullptr); @@ -137,18 +152,25 @@ struct buffer { return is_device_accessible(mem_type) ? device_type::gpu : device_type::cpu; }()}, extents_{other.extents()}, + length_([this]() { + std::size_t length = 1; + for (std::size_t i = 0; i < extents_.rank(); ++i) { + length *= extents_.extent(i); + } + return length; + }()), data_{[this, &other, mem_type, handle]() { auto result = data_store{}; auto result_data = static_cast(nullptr); if (is_device_accessible(mem_type)) { auto buf = - detail::owning_buffer(handle, extents_); + detail::owning_buffer(handle, extents_); result_data = buf.get(); result = std::move(buf); RAFT_LOG_INFO("gpu copy called"); detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::gpu, other.dev_type()); } else { - auto buf = detail::owning_buffer(handle, extents_); + auto buf = detail::owning_buffer(handle, extents_); result_data = buf.get(); result = std::move(buf); RAFT_LOG_INFO("cpu copy called"); @@ -156,7 +178,6 @@ struct buffer { } return result; }()}, - size_{other.size()}, memory_type_{mem_type}, cached_ptr{[this]() { auto result = 
static_cast(nullptr); @@ -170,7 +191,7 @@ struct buffer { RAFT_LOG_INFO("Pointer to other's data %p\n", other.data_handle()); } - friend void swap(buffer& first, buffer& second) + friend void swap(buffer& first, buffer& second) { using std::swap; swap(first.device_type_, second.device_type_); @@ -179,7 +200,7 @@ struct buffer { swap(first.memory_type_, second.memory_type_); swap(first.cached_ptr, second.cached_ptr); } - buffer& operator=(buffer const& other) { + buffer& operator=(buffer const& other) { auto copy = other; swap(*this, copy); return *this; @@ -189,7 +210,7 @@ struct buffer { * @brief Create owning copy of existing buffer with given stream * The device type of this new buffer will be the same as the original */ - buffer(raft::resources const& handle, buffer const& other) : buffer(handle, other, other.mem_type()) + buffer(raft::resources const& handle, buffer const& other) : buffer(handle, other, other.mem_type()) { } @@ -197,7 +218,7 @@ struct buffer { * @brief Move from existing buffer unless a copy is necessary based on * memory location */ - buffer(raft::resources const& handle, buffer&& other, memory_type mem_type) + buffer(raft::resources const& handle, buffer&& other, memory_type mem_type) : device_type_{[mem_type]() { return is_device_accessible(mem_type) ? 
device_type::gpu : device_type::cpu; }()}, @@ -211,14 +232,16 @@ struct buffer { if (is_device_accessible(mem_type)) { auto buf = detail::owning_buffer{handle, extents_}; + Extents, + LayoutPolicy, + ContainerPolicy>{handle, extents_}; result_data = buf.get(); result = std::move(buf); detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::gpu, other.dev_type()); } else { auto buf = detail::owning_buffer{handle, extents_}; + Extents, LayoutPolicy, ContainerPolicy>{handle, extents_}; result_data = buf.get(); result = std::move(buf); detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::cpu, other.dev_type()); @@ -226,7 +249,6 @@ struct buffer { } return result; }()}, - size_{other.size()}, memory_type_{mem_type}, cached_ptr{[this]() { auto result = static_cast(nullptr); @@ -235,6 +257,7 @@ struct buffer { case 1: result = std::get<1>(data_).get(); break; case 2: result = std::get<2>(data_).get(); break; case 3: result = std::get<3>(data_).get(); break; + case 4: result = std::get<4>(data_).get(); break; } return result; }()} @@ -247,7 +270,7 @@ struct buffer { // RAFT_LOG_INFO("copy constructor without stream and device called"); // } - buffer(buffer&& other) noexcept + buffer(buffer&& other) noexcept : device_type_{[&other]() { return is_device_accessible(other.mem_type()) ? 
device_type::gpu : device_type::cpu; }()}, @@ -256,7 +279,7 @@ struct buffer { result = std::move(other.data_); return result; }()}, - size_{other.size()}, + extents_{other.extents_}, memory_type_{other.mem_type()}, cached_ptr{[this]() { auto result = static_cast(nullptr); @@ -271,17 +294,16 @@ struct buffer { { RAFT_LOG_INFO("trivial move called"); } - buffer& operator=(buffer&& other) noexcept { + buffer& operator=(buffer&& other) noexcept { RAFT_LOG_INFO("operator= move called"); data_ = std::move(other.data_); device_type_ = std::move(other.device_type_); - size_ = std::move(other.size_); + extents_ = std::move(other.extents_); memory_type_ = std::move(other.memory_type_); cached_ptr = std::move(other.cached_ptr); return *this; } auto extents() const noexcept { return extents_; } - auto size() const noexcept { return size_; } HOST DEVICE auto* data_handle() const noexcept { auto result = static_cast(nullptr); switch (data_.index()) { @@ -289,6 +311,7 @@ struct buffer { case 1: result = std::get<1>(data_).get(); break; case 2: result = std::get<2>(data_).get(); break; case 3: result = std::get<3>(data_).get(); break; + case 4: result = std::get<4>(data_).get(); break; } RAFT_LOG_INFO("data_handle() called: data %p; cached_ptr %p\n", result, cached_ptr); return result;} @@ -304,6 +327,20 @@ struct buffer { // return make_mdspan mem_type()), is_device_accessible(this -> mem_type())>(data_, make_extents(size_)); // } + HOST DEVICE auto view() const noexcept { + if (data_.index() == 0) + return std::get<0>(data_).view(); + if (data_.index() == 1) + return std::get<1>(data_).view(); + if (data_.index() == 2) + return std::get<2>(data_).view(); + if (data_.index() == 3) + return std::get<3>(data_).view(); + if (data_.index() == 4) + return std::get<4>(data_).view(); + } + + auto size() {return length_;} private: auto dev_type() const noexcept { @@ -311,9 +348,9 @@ struct buffer { } enum device_type device_type_; - buffer_extent extents_; + Extents extents_; data_store 
data_; - size_t size_; + size_t length_; enum memory_type memory_type_; ElementType* cached_ptr; }; diff --git a/cpp/test/core/buffer.cpp b/cpp/test/core/buffer.cpp index c2cc7855f7..879530f40c 100644 --- a/cpp/test/core/buffer.cpp +++ b/cpp/test/core/buffer.cpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -26,7 +26,8 @@ namespace raft { TEST(Buffer, default_buffer) { - auto buf = buffer(); + auto exts = raft::make_extents(5); + auto buf = buffer(); EXPECT_EQ(buf.mem_type(), memory_type::host); EXPECT_EQ(buf.size(), 0); } From 81c6a81fae66148ca90b43cda582ddbfea608534 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 8 Jun 2023 17:09:46 -0700 Subject: [PATCH 22/75] Working build --- .../detail/buffer_utils/non_owning_buffer.hpp | 15 +- .../detail/buffer_utils/owning_buffer_cpu.hpp | 16 +- .../detail/buffer_utils/owning_buffer_gpu.hpp | 18 +- cpp/include/raft/core/mdbuffer.hpp | 136 ++--- cpp/test/core/buffer.cpp | 518 +++++++++--------- 5 files changed, 362 insertions(+), 341 deletions(-) diff --git a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp index 94052ef7fd..fc704ea71c 100644 --- a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp @@ -24,18 +24,23 @@ template struct non_owning_buffer { + using index_type = typename Extents::index_type; non_owning_buffer() : data_{nullptr} {} non_owning_buffer(ElementType* ptr, Extents extents) : data_{ptr}, extents_{extents} { } - auto* data_handle() const { return data_; } + auto* get() const { return data_; } - auto* view() { - bool device_accessible = is_device_accessible(M); - bool host_accessible = is_host_accessible(M); - return make_mdspan(data_, extents_); + auto view() { + if (is_host_device_accessible(M)) { + return make_mdspan(data_, extents_); + } else if (is_device_accessible(M)) { + return 
make_mdspan(data_, extents_); + } else { + return make_mdspan(data_, extents_); + } } private: ElementType* data_; diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp index c49683b62b..6d23d20436 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp @@ -20,17 +20,22 @@ #include #include #include +#include #include +#include namespace raft { namespace detail { template typename ContainerPolicy = host_vector_policy> + typename LayoutPolicy, + template typename ContainerPolicy> struct owning_buffer { using element_type = std::remove_cv_t; - using container_policy = ContainerPolicy; + using container_policy = std::conditional_t, ContainerPolicy>, + std::variant_alternative_t<0, buffer_container_policy>, + ContainerPolicy>; + using index_type = typename Extents::index_type; using owning_host_buffer = host_mdarray; owning_buffer(raft::resources const& handle, Extents extents) noexcept(false) : extents_{extents}, data_{[&extents, handle]() { @@ -44,6 +49,11 @@ struct owning_buffer(data_.data_handle()); } + auto view() { + return make_mdspan(data_.data_handle(), + extents_); + } + private: Extents extents_; owning_host_buffer data_; diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp index 8845da4bb8..414b444100 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp @@ -14,21 +14,27 @@ * limitations under the License. 
*/ #pragma once +#include "raft/core/logger.hpp" #include "owning_buffer_base.hpp" #include #include #include #include +#include +#include namespace raft { namespace detail { template typename ContainerPolicy = device_uvector_policy> + typename LayoutPolicy, + template typename ContainerPolicy> struct owning_buffer { using element_type = std::remove_cv_t; - using container_policy = ContainerPolicy; + using container_policy = std::conditional_t, ContainerPolicy>, + std::variant_alternative_t<1, buffer_container_policy>, + ContainerPolicy>; + using index_type = typename Extents::index_type; using owning_device_buffer = device_mdarray; owning_buffer() : data_{} {} @@ -43,8 +49,12 @@ struct owning_buffer(data_.data_handle()); } + auto* get() const {return const_cast(data_.data_handle());} + auto view() { + return make_mdspan(data_.data_handle(), + extents_); + } private: Extents extents_; owning_device_buffer data_; diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp index 6b11c589c1..eaeb3ffa5a 100644 --- a/cpp/include/raft/core/mdbuffer.hpp +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -14,7 +14,6 @@ * limitations under the License. */ #pragma once -#include "raft/core/device_container_policy.hpp" #include "raft/core/logger.hpp" #include #include @@ -69,13 +68,6 @@ struct buffer { return is_device_accessible(mem_type) ? 
device_type::gpu : device_type::cpu; }()}, extents_{extents}, - length_([this]() { - std::size_t length = 1; - for (std::size_t i = 0; i < extents_.rank(); ++i) { - length *= extents_.extent(i); - } - return length; - }()), data_{[this, mem_type, handle]() { auto result = data_store{}; if (is_device_accessible(mem_type)) { @@ -85,6 +77,13 @@ struct buffer { } return result; }()}, + length_([this]() { + size_t length = 1; + for (size_t i = 0; i < extents_.rank(); ++i) { + length *= extents_.extent(i); + } + return length; + }()), memory_type_{mem_type}, cached_ptr{[this]() { auto result = static_cast(nullptr); @@ -107,13 +106,6 @@ struct buffer { return is_device_accessible(mem_type) ? device_type::gpu : device_type::cpu; }()}, extents_{extents}, - length_([this]() { - std::size_t length = 1; - for (std::size_t i = 0; i < extents_.rank(); ++i) { - length *= extents_.extent(i); - } - return length; - }()), data_{[this, input_data, mem_type]() { auto result = data_store{}; if (is_host_device_accessible(mem_type)) { @@ -125,6 +117,13 @@ struct buffer { } return result; }()}, + length_([this]() { + std::size_t length = 1; + for (std::size_t i = 0; i < extents_.rank(); ++i) { + length *= extents_.extent(i); + } + return length; + }()), memory_type_{mem_type}, cached_ptr{[this]() { auto result = static_cast(nullptr); @@ -152,13 +151,6 @@ struct buffer { return is_device_accessible(mem_type) ? 
device_type::gpu : device_type::cpu; }()}, extents_{other.extents()}, - length_([this]() { - std::size_t length = 1; - for (std::size_t i = 0; i < extents_.rank(); ++i) { - length *= extents_.extent(i); - } - return length; - }()), data_{[this, &other, mem_type, handle]() { auto result = data_store{}; auto result_data = static_cast(nullptr); @@ -178,6 +170,13 @@ struct buffer { } return result; }()}, + length_([this]() { + std::size_t length = 1; + for (std::size_t i = 0; i < extents_.rank(); ++i) { + length *= extents_.extent(i); + } + return length; + }()), memory_type_{mem_type}, cached_ptr{[this]() { auto result = static_cast(nullptr); @@ -264,22 +263,18 @@ struct buffer { { RAFT_LOG_INFO("main move called"); } - // buffer(buffer&& other, device_type mem_type) - // : buffer{std::move(other), mem_type, 0, execution_stream{}} - // { - // RAFT_LOG_INFO("copy constructor without stream and device called"); - // } buffer(buffer&& other) noexcept : device_type_{[&other]() { return is_device_accessible(other.mem_type()) ? 
device_type::gpu : device_type::cpu; }()}, + extents_{other.extents_}, data_{[&other]() { auto result = data_store{}; result = std::move(other.data_); return result; }()}, - extents_{other.extents_}, + length_{other.length_}, memory_type_{other.mem_type()}, cached_ptr{[this]() { auto result = static_cast(nullptr); @@ -288,6 +283,7 @@ struct buffer { case 1: result = std::get<1>(data_).get(); break; case 2: result = std::get<2>(data_).get(); break; case 3: result = std::get<3>(data_).get(); break; + case 4: result = std::get<4>(data_).get(); break; } return result; }()} @@ -296,25 +292,29 @@ struct buffer { } buffer& operator=(buffer&& other) noexcept { RAFT_LOG_INFO("operator= move called"); - data_ = std::move(other.data_); device_type_ = std::move(other.device_type_); extents_ = std::move(other.extents_); + data_ = std::move(other.data_); + length_ = std::move(other.size()); memory_type_ = std::move(other.memory_type_); cached_ptr = std::move(other.cached_ptr); return *this; } auto extents() const noexcept { return extents_; } HOST DEVICE auto* data_handle() const noexcept { - auto result = static_cast(nullptr); - switch (data_.index()) { - case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; - case 2: result = std::get<2>(data_).get(); break; - case 3: result = std::get<3>(data_).get(); break; - case 4: result = std::get<4>(data_).get(); break; - } - RAFT_LOG_INFO("data_handle() called: data %p; cached_ptr %p\n", result, cached_ptr); - return result;} + // auto result = static_cast(nullptr); + // switch (data_.index()) { + // case 0: {RAFT_LOG_INFO("0th"); result = std::get<0>(data_).get(); break;} + // case 1: {RAFT_LOG_INFO("1th"); result = std::get<1>(data_).get(); break;} + // case 2: {RAFT_LOG_INFO("2th"); result = std::get<2>(data_).get(); break;} + // case 3: {RAFT_LOG_INFO("3th"); result = std::get<3>(data_).get(); break;} + // case 4: {RAFT_LOG_INFO("4th"); result = std::get<4>(data_).get(); break;} + + 
// } + // RAFT_LOG_INFO("data_handle() called: data %p; cached_ptr %p\n", result, cached_ptr); + // return result; + return cached_ptr; + } auto mem_type() const noexcept { @@ -355,34 +355,34 @@ struct buffer { ElementType* cached_ptr; }; -template -detail::const_agnostic_same_t copy(raft::resources const& handle, - buffer & dst, - buffer const& src, - size_t dst_offset, - size_t src_offset, - size_t size) -{ - if constexpr (bounds_check) { - if (src.size() - src_offset < size || dst.size() - dst_offset < size) { - throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); - } - } - auto src_device_type = is_device_accessible(src.mem_type()) ? device_type::gpu : device_type::cpu; - auto dst_device_type = is_device_accessible(dst.mem_type()) ? device_type::gpu : device_type::cpu; - detail::buffer_copy(handle, - dst.data_handle() + dst_offset, - src.data_handle() + src_offset, - size, - dst_device_type, - src_device_type); -} +// template +// detail::const_agnostic_same_t copy(raft::resources const& handle, +// buffer & dst, +// buffer const& src, +// size_t dst_offset, +// size_t src_offset, +// size_t size) +// { +// if constexpr (bounds_check) { +// if (src.size() - src_offset < size || dst.size() - dst_offset < size) { +// throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); +// } +// } +// auto src_device_type = is_device_accessible(src.mem_type()) ? device_type::gpu : device_type::cpu; +// auto dst_device_type = is_device_accessible(dst.mem_type()) ? 
device_type::gpu : device_type::cpu; +// detail::buffer_copy(handle, +// dst.data_handle() + dst_offset, +// src.data_handle() + src_offset, +// size, +// dst_device_type, +// src_device_type); +// } -template -detail::const_agnostic_same_t copy(raft::resources const& handle, - buffer& dst, - buffer const& src) -{ - copy(handle, dst, src, 0, 0, src.size()); -} +// template +// detail::const_agnostic_same_t copy(raft::resources const& handle, +// buffer& dst, +// buffer const& src) +// { +// copy(handle, dst, src, 0, 0, src.size()); +// } } // namespace raft \ No newline at end of file diff --git a/cpp/test/core/buffer.cpp b/cpp/test/core/buffer.cpp index 879530f40c..aa83b1514b 100644 --- a/cpp/test/core/buffer.cpp +++ b/cpp/test/core/buffer.cpp @@ -22,6 +22,8 @@ #include #include +#include + namespace raft { TEST(Buffer, default_buffer) @@ -36,281 +38,275 @@ TEST(Buffer, device_buffer) { raft::resources handle; auto data = std::vector{1, 2, 3}; - auto test_buffers = std::vector>{}; - test_buffers.emplace_back(handle, data.size(), memory_type::device); - // test_buffers.emplace_back(handle, data.size(), memory_type::device); - // test_buffers.emplace_back(handle, data.size(), memory_type::device); + auto exts = raft::make_extents(data.size()); + auto test_buffers = std::vector>{}; + test_buffers.emplace_back(handle, exts, memory_type::device); + test_buffers.emplace_back(handle, exts, memory_type::device); + test_buffers.emplace_back(handle, exts, memory_type::device); for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::device); ASSERT_EQ(buf.size(), data.size()); #ifndef RAFT_DISABLE_GPU ASSERT_NE(buf.data_handle(), nullptr); - auto data_out = std::vector(data.size()); - cudaMemcpy(static_cast(buf.data_handle()), - static_cast(data.data()), - sizeof(int) * data.size(), - cudaMemcpyHostToDevice); - cudaMemcpy(static_cast(data_out.data()), - static_cast(buf.data_handle()), - sizeof(int) * data.size(), - cudaMemcpyDeviceToHost); + 
raft::update_device(buf.data_handle(), data.data(), data.size(), raft::resource::get_cuda_stream(handle)); + raft::update_host(data_out.data(), buf.data_handle(), buf.size(), raft::resource::get_cuda_stream(handle)); EXPECT_THAT(data_out, testing::ElementsAreArray(data)); #endif } } -TEST(Buffer, non_owning_device_buffer) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - auto* ptr_d = static_cast(nullptr); -#ifndef RAFT_DISABLE_GPU - cudaMalloc(reinterpret_cast(&ptr_d), sizeof(int) * data.size()); - cudaMemcpy(static_cast(ptr_d), - static_cast(data.data()), - sizeof(int) * data.size(), - cudaMemcpyHostToDevice); -#endif - auto test_buffers = std::vector>{}; - test_buffers.emplace_back(handle, ptr_d, data.size(), memory_type::device); - test_buffers.emplace_back(handle, ptr_d, data.size(), memory_type::device); -#ifndef RAFT_DISABLE_GPU - - for (auto& buf : test_buffers) { - ASSERT_EQ(buf.mem_type(), memory_type::device); - ASSERT_EQ(buf.size(), data.size()); - ASSERT_EQ(buf.data_handle(), ptr_d); - - auto data_out = std::vector(data.size()); - cudaMemcpy(static_cast(data_out.data()), - static_cast(buf.data_handle()), - sizeof(int) * data.size(), - cudaMemcpyDeviceToHost); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - } - cudaFree(reinterpret_cast(ptr_d)); -#endif -} - -TEST(Buffer, host_buffer) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - auto test_buffers = std::vector>{}; - test_buffers.emplace_back(handle, data.size(), memory_type::host); - test_buffers.emplace_back(handle, data.size(), memory_type::host); - test_buffers.emplace_back(handle, data.size(), memory_type::host); - test_buffers.emplace_back(handle, data.size()); - - for (auto& buf : test_buffers) { - ASSERT_EQ(buf.mem_type(), memory_type::host); - ASSERT_EQ(buf.size(), data.size()); - ASSERT_NE(buf.data_handle(), nullptr); - - std::memcpy( - static_cast(buf.data_handle()), static_cast(data.data()), data.size() * sizeof(int)); - - auto data_out = 
std::vector(buf.data_handle(), buf.data_handle() + buf.size()); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - } -} - -TEST(Buffer, non_owning_host_buffer) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - std::vector> test_buffers; - test_buffers.emplace_back(handle, data.data(), data.size(), memory_type::host); - test_buffers.emplace_back(handle, data.data(), data.size(), memory_type::host); - test_buffers.emplace_back(handle, data.data(), data.size()); - - for (auto& buf : test_buffers) { - ASSERT_EQ(buf.mem_type(), memory_type::host); - ASSERT_EQ(buf.size(), data.size()); - ASSERT_EQ(buf.data_handle(), data.data()); - - auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - } -} - -TEST(Buffer, copy_constructor) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - buffer const orig_buffer = buffer(handle, data.data(), data.size(), memory_type::host); - - // host to host copy operations - auto test_buffers = std::vector>{}; - test_buffers.emplace_back(handle, orig_buffer); - test_buffers.emplace_back(handle, orig_buffer, memory_type::host); - test_buffers.emplace_back(handle, orig_buffer, memory_type::host); - test_buffers.emplace_back(handle, orig_buffer, memory_type::host); - - for (auto& buf : test_buffers) { - ASSERT_EQ(buf.mem_type(), memory_type::host); - ASSERT_EQ(buf.size(), data.size()); - ASSERT_NE(buf.data_handle(), orig_buffer.data_handle()); - - auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - -#ifndef RAFT_DISABLE_GPU - // host to device copy operations - auto test_dev_buffers = std::vector>{}; - test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); - test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); - test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); - for 
(auto& dev_buf : test_dev_buffers) { - data_out = std::vector(data.size()); - RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(dev_buf.data_handle()), dev_buf.size() * sizeof(int), cudaMemcpyDefault)); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// TEST(Buffer, non_owning_device_buffer) +// { +// raft::resources handle; +// auto data = std::vector{1, 2, 3}; +// auto* ptr_d = static_cast(nullptr); +// #ifndef RAFT_DISABLE_GPU +// cudaMalloc(reinterpret_cast(&ptr_d), sizeof(int) * data.size()); +// cudaMemcpy(static_cast(ptr_d), +// static_cast(data.data()), +// sizeof(int) * data.size(), +// cudaMemcpyHostToDevice); +// #endif +// auto test_buffers = std::vector>{}; +// test_buffers.emplace_back(handle, ptr_d, data.size(), memory_type::device); +// test_buffers.emplace_back(handle, ptr_d, data.size(), memory_type::device); +// #ifndef RAFT_DISABLE_GPU + +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::device); +// ASSERT_EQ(buf.size(), data.size()); +// ASSERT_EQ(buf.data_handle(), ptr_d); + +// auto data_out = std::vector(data.size()); +// cudaMemcpy(static_cast(data_out.data()), +// static_cast(buf.data_handle()), +// sizeof(int) * data.size(), +// cudaMemcpyDeviceToHost); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// } +// cudaFree(reinterpret_cast(ptr_d)); +// #endif +// } + +// TEST(Buffer, host_buffer) +// { +// raft::resources handle; +// auto data = std::vector{1, 2, 3}; +// auto test_buffers = std::vector>{}; +// test_buffers.emplace_back(handle, data.size(), memory_type::host); +// test_buffers.emplace_back(handle, data.size(), memory_type::host); +// test_buffers.emplace_back(handle, data.size(), memory_type::host); +// test_buffers.emplace_back(handle, data.size()); + +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::host); +// ASSERT_EQ(buf.size(), data.size()); +// ASSERT_NE(buf.data_handle(), nullptr); + +// std::memcpy( +// 
static_cast(buf.data_handle()), static_cast(data.data()), data.size() * sizeof(int)); + +// auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// } +// } + +// TEST(Buffer, non_owning_host_buffer) +// { +// raft::resources handle; +// auto data = std::vector{1, 2, 3}; +// std::vector> test_buffers; +// test_buffers.emplace_back(handle, data.data(), data.size(), memory_type::host); +// test_buffers.emplace_back(handle, data.data(), data.size(), memory_type::host); +// test_buffers.emplace_back(handle, data.data(), data.size()); + +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::host); +// ASSERT_EQ(buf.size(), data.size()); +// ASSERT_EQ(buf.data_handle(), data.data()); + +// auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// } +// } + +// TEST(Buffer, copy_constructor) +// { +// raft::resources handle; +// auto data = std::vector{1, 2, 3}; +// buffer const orig_buffer = buffer(handle, data.data(), data.size(), memory_type::host); + +// // host to host copy operations +// auto test_buffers = std::vector>{}; +// test_buffers.emplace_back(handle, orig_buffer); +// test_buffers.emplace_back(handle, orig_buffer, memory_type::host); +// test_buffers.emplace_back(handle, orig_buffer, memory_type::host); +// test_buffers.emplace_back(handle, orig_buffer, memory_type::host); + +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::host); +// ASSERT_EQ(buf.size(), data.size()); +// ASSERT_NE(buf.data_handle(), orig_buffer.data_handle()); + +// auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + +// #ifndef RAFT_DISABLE_GPU +// // host to device copy operations +// auto test_dev_buffers = std::vector>{}; +// test_dev_buffers.emplace_back(handle, 
orig_buffer, memory_type::device); +// test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); +// test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); +// for (auto& dev_buf : test_dev_buffers) { +// data_out = std::vector(data.size()); +// RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(dev_buf.data_handle()), dev_buf.size() * sizeof(int), cudaMemcpyDefault)); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - // device to device copy operations - auto test_dev_copies = std::vector>{}; - test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); - test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); - test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); - // for (auto& copy_buf : test_dev_copies) { - // data_out = std::vector(data.size()); - // RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(copy_buf.data_handle()), copy_buf.size() * sizeof(int), cudaMemcpyDefault)); - // EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - // } - - // // device to host copy operations - // auto test_host_buffers = std::vector>{}; - // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); - // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); - // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); - // for (auto& host_buf : test_host_buffers) { - // data_out = std::vector(host_buf.data_handle(), host_buf.data_handle() + host_buf.size()); - // EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - // } - } -#endif - } -} - -TEST(Buffer, move_buffer) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - auto test_buffers = std::vector>{}; - test_buffers.emplace_back(buffer(handle, data.data(), data.size(), memory_type::host)); - test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host); - test_buffers.emplace_back(handle, 
buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host); - test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host); - - for (auto& buf : test_buffers) { - ASSERT_EQ(buf.mem_type(), memory_type::host); - ASSERT_EQ(buf.size(), data.size()); - ASSERT_EQ(buf.data_handle(), data.data()); - - auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - } -#ifndef RAFT_DISABLE_GPU - test_buffers = std::vector>{}; - test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device); - test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device); - test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device); - for (auto& buf : test_buffers) { - ASSERT_EQ(buf.mem_type(), memory_type::device); - ASSERT_EQ(buf.size(), data.size()); - ASSERT_NE(buf.data_handle(), data.data()); - - auto data_out = std::vector(buf.size()); - RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(buf.data_handle()), buf.size() * sizeof(int), cudaMemcpyDefault)); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - } -#endif -} - -TEST(Buffer, move_assignment_buffer) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - -#ifndef RAFT_DISABLE_GPU - auto buf = buffer{handle, data.data(), data.size() - 1, memory_type::device}; -#else - auto buf = buffer{handle, data.data(), data.size() - 1, memory_type::host}; -#endif - buf = buffer{handle, data.size(), memory_type::host}; - - ASSERT_EQ(buf.mem_type(), memory_type::host); - ASSERT_EQ(buf.size(), data.size()); -} - -TEST(Buffer, partial_buffer_copy) -{ - raft::resources handle; - auto data1 = std::vector{1, 2, 3, 4, 5}; - auto data2 = std::vector{0, 0, 0, 0, 0}; - auto expected = std::vector{0, 3, 4, 
5, 0}; -#ifndef RAFT_DISABLE_GPU - auto buf1 = buffer{handle, buffer{handle, data1.data(), data1.size(), memory_type::host}, memory_type::device}; -#else - auto buf1 = buffer{handle, data1.data(), data1.size(), memory_type::host}; -#endif - auto buf2 = buffer{handle, data2.data(), data2.size(), memory_type::host}; - copy(handle, buf2, buf1, 1, 2, 3); - copy(handle, buf2, buf1, 1, 2, 3); - EXPECT_THROW(copy(handle, buf2, buf1, 1, 2, 4), out_of_bounds); -} - -TEST(Buffer, buffer_copy_overloads) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - auto expected = data; - auto orig_host_buffer = buffer(handle, data.data(), data.size(), memory_type::host); - auto orig_dev_buffer = buffer(handle, orig_host_buffer, memory_type::device); - auto copy_dev_buffer = buffer(handle, data.size(), memory_type::device); +// // device to device copy operations +// auto test_dev_copies = std::vector>{}; +// test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); +// test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); +// test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); +// // for (auto& copy_buf : test_dev_copies) { +// // data_out = std::vector(data.size()); +// // RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(copy_buf.data_handle()), copy_buf.size() * sizeof(int), cudaMemcpyDefault)); +// // EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// // } + +// // // device to host copy operations +// // auto test_host_buffers = std::vector>{}; +// // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); +// // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); +// // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); +// // for (auto& host_buf : test_host_buffers) { +// // data_out = std::vector(host_buf.data_handle(), host_buf.data_handle() + host_buf.size()); +// // EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// // } +// } +// #endif 
+// } +// } + +// TEST(Buffer, move_buffer) +// { +// raft::resources handle; +// auto data = std::vector{1, 2, 3}; +// auto test_buffers = std::vector>{}; +// test_buffers.emplace_back(buffer(handle, data.data(), data.size(), memory_type::host)); +// test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host); +// test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host); +// test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host); + +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::host); +// ASSERT_EQ(buf.size(), data.size()); +// ASSERT_EQ(buf.data_handle(), data.data()); + +// auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// } +// #ifndef RAFT_DISABLE_GPU +// test_buffers = std::vector>{}; +// test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device); +// test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device); +// test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device); +// for (auto& buf : test_buffers) { +// ASSERT_EQ(buf.mem_type(), memory_type::device); +// ASSERT_EQ(buf.size(), data.size()); +// ASSERT_NE(buf.data_handle(), data.data()); + +// auto data_out = std::vector(buf.size()); +// RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(buf.data_handle()), buf.size() * sizeof(int), cudaMemcpyDefault)); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); +// } +// #endif +// } + +// TEST(Buffer, move_assignment_buffer) +// { +// raft::resources handle; +// auto data = std::vector{1, 2, 3}; + +// #ifndef RAFT_DISABLE_GPU +// auto buf = buffer{handle, data.data(), 
data.size() - 1, memory_type::device}; +// #else +// auto buf = buffer{handle, data.data(), data.size() - 1, memory_type::host}; +// #endif +// buf = buffer{handle, data.size(), memory_type::host}; + +// ASSERT_EQ(buf.mem_type(), memory_type::host); +// ASSERT_EQ(buf.size(), data.size()); +// } + +// TEST(Buffer, partial_buffer_copy) +// { +// raft::resources handle; +// auto data1 = std::vector{1, 2, 3, 4, 5}; +// auto data2 = std::vector{0, 0, 0, 0, 0}; +// auto expected = std::vector{0, 3, 4, 5, 0}; +// #ifndef RAFT_DISABLE_GPU +// auto buf1 = buffer{handle, buffer{handle, data1.data(), data1.size(), memory_type::host}, memory_type::device}; +// #else +// auto buf1 = buffer{handle, data1.data(), data1.size(), memory_type::host}; +// #endif +// auto buf2 = buffer{handle, data2.data(), data2.size(), memory_type::host}; +// copy(handle, buf2, buf1, 1, 2, 3); +// copy(handle, buf2, buf1, 1, 2, 3); +// EXPECT_THROW(copy(handle, buf2, buf1, 1, 2, 4), out_of_bounds); +// } + +// TEST(Buffer, buffer_copy_overloads) +// { +// raft::resources handle; +// auto data = std::vector{1, 2, 3}; +// auto expected = data; +// auto orig_host_buffer = buffer(handle, data.data(), data.size(), memory_type::host); +// auto orig_dev_buffer = buffer(handle, orig_host_buffer, memory_type::device); +// auto copy_dev_buffer = buffer(handle, data.size(), memory_type::device); - // copying host to host - auto data_out = std::vector(data.size()); - auto copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); - copy(handle, copy_host_buffer, orig_host_buffer); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); - - // copying host to host with stream - data_out = std::vector(data.size()); - copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); - copy(handle, copy_host_buffer, orig_host_buffer); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); - - // copying host to host with offset - data_out = 
std::vector(data.size() + 1); - copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); - copy(handle, copy_host_buffer, orig_host_buffer, 2, 1, 1); - expected = std::vector{0, 0, 2, 0}; - EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); - -#ifndef RAFT_DISABLE_GPU - // copy device to host - data_out = std::vector(data.size()); - copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); - copy(handle, copy_host_buffer, orig_dev_buffer); - expected = data; - EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); - - // copy device to host with stream - data_out = std::vector(data.size()); - copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); - copy(handle, copy_host_buffer, orig_dev_buffer); - expected = data; - EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); +// // copying host to host +// auto data_out = std::vector(data.size()); +// auto copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); +// copy(handle, copy_host_buffer, orig_host_buffer); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + +// // copying host to host with stream +// data_out = std::vector(data.size()); +// copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); +// copy(handle, copy_host_buffer, orig_host_buffer); +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + +// // copying host to host with offset +// data_out = std::vector(data.size() + 1); +// copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); +// copy(handle, copy_host_buffer, orig_host_buffer, 2, 1, 1); +// expected = std::vector{0, 0, 2, 0}; +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + +// #ifndef RAFT_DISABLE_GPU +// // copy device to host +// data_out = std::vector(data.size()); +// copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); 
+// copy(handle, copy_host_buffer, orig_dev_buffer); +// expected = data; +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + +// // copy device to host with stream +// data_out = std::vector(data.size()); +// copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); +// copy(handle, copy_host_buffer, orig_dev_buffer); +// expected = data; +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); - // copy device to host with offset - data_out = std::vector(data.size() + 1); - copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); - copy(handle, copy_host_buffer, orig_dev_buffer, 2, 1, 1); - expected = std::vector{0, 0, 2, 0}; - EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); -#endif -} +// // copy device to host with offset +// data_out = std::vector(data.size() + 1); +// copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); +// copy(handle, copy_host_buffer, orig_dev_buffer, 2, 1, 1); +// expected = std::vector{0, 0, 2, 0}; +// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); +// #endif +// } } \ No newline at end of file From 451815e5762d7ddab52c1084f45db9091341b31f Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 12 Jun 2023 15:48:16 -0700 Subject: [PATCH 23/75] Update buffer accessor policy --- .../detail/buffer_utils/non_owning_buffer.hpp | 21 ++- .../detail/buffer_utils/owning_buffer_cpu.hpp | 3 +- .../detail/buffer_utils/owning_buffer_gpu.hpp | 3 +- cpp/include/raft/core/mdbuffer.hpp | 12 +- cpp/test/core/buffer.cpp | 139 +++++++++--------- 5 files changed, 93 insertions(+), 85 deletions(-) diff --git a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp index fc704ea71c..a5c9244a00 100644 --- a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp @@ -14,16 
+14,25 @@ * limitations under the License. */ #pragma once +#include "raft/core/buffer_container_policy.hpp" +#include "raft/core/host_container_policy.hpp" +#include "raft/core/host_device_accessor.hpp" #include #include +#include namespace raft { namespace detail { template + typename LayoutPolicy = layout_c_contiguous, + template typename ContainerPolicy = buffer_container_policy> struct non_owning_buffer { + using container_policy = std::conditional_t, ContainerPolicy>, + std::variant_alternative_t<0, buffer_container_policy>, + ContainerPolicy>; + using accessor_policy = typename container_policy::accessor_policy; using index_type = typename Extents::index_type; non_owning_buffer() : data_{nullptr} {} @@ -34,13 +43,9 @@ struct non_owning_buffer { auto* get() const { return data_; } auto view() { - if (is_host_device_accessible(M)) { - return make_mdspan(data_, extents_); - } else if (is_device_accessible(M)) { - return make_mdspan(data_, extents_); - } else { - return make_mdspan(data_, extents_); - } + using accessor_type = host_device_accessor< + accessor_policy, M>(); + return mdspan{data_, extents_}; } private: ElementType* data_; diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp index 6d23d20436..fa8205b2ed 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp @@ -50,8 +50,7 @@ struct owning_buffer(data_.data_handle()); } auto view() { - return make_mdspan(data_.data_handle(), - extents_); + return data_.view(); } private: diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp index 414b444100..7ef0c86396 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp @@ -52,8 +52,7 @@ struct 
owning_buffer(data_.data_handle());} auto view() { - return make_mdspan(data_.data_handle(), - extents_); + data_.view(); } private: Extents extents_; diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp index eaeb3ffa5a..0918f89ef4 100644 --- a/cpp/include/raft/core/mdbuffer.hpp +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -50,9 +50,9 @@ template typename ContainerPolicy = buffer_container_policy> struct buffer { - using data_store = std::variant, - detail::non_owning_buffer, - detail::non_owning_buffer, + using data_store = std::variant, + detail::non_owning_buffer, + detail::non_owning_buffer, detail::owning_buffer, detail::owning_buffer>; @@ -109,11 +109,11 @@ struct buffer { data_{[this, input_data, mem_type]() { auto result = data_store{}; if (is_host_device_accessible(mem_type)) { - result = detail::non_owning_buffer{input_data, extents_}; + result = detail::non_owning_buffer{input_data, extents_}; } else if (is_device_accessible(mem_type)) { - result = detail::non_owning_buffer{input_data, extents_}; + result = detail::non_owning_buffer{input_data, extents_}; } else { - result = detail::non_owning_buffer{input_data, extents_}; + result = detail::non_owning_buffer{input_data, extents_}; } return result; }()}, diff --git a/cpp/test/core/buffer.cpp b/cpp/test/core/buffer.cpp index aa83b1514b..d380d65d09 100644 --- a/cpp/test/core/buffer.cpp +++ b/cpp/test/core/buffer.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "raft/core/mdspan.hpp" #include #include #include @@ -28,10 +29,10 @@ namespace raft { TEST(Buffer, default_buffer) { - auto exts = raft::make_extents(5); - auto buf = buffer(); + auto buf = buffer>(); EXPECT_EQ(buf.mem_type(), memory_type::host); EXPECT_EQ(buf.size(), 0); + ASSERT_NE(buf.data_handle(), nullptr); } TEST(Buffer, device_buffer) @@ -57,80 +58,84 @@ TEST(Buffer, device_buffer) } } -// TEST(Buffer, non_owning_device_buffer) -// { -// raft::resources handle; -// auto data = std::vector{1, 2, 3}; -// auto* ptr_d = static_cast(nullptr); -// #ifndef RAFT_DISABLE_GPU -// cudaMalloc(reinterpret_cast(&ptr_d), sizeof(int) * data.size()); -// cudaMemcpy(static_cast(ptr_d), -// static_cast(data.data()), -// sizeof(int) * data.size(), -// cudaMemcpyHostToDevice); -// #endif -// auto test_buffers = std::vector>{}; -// test_buffers.emplace_back(handle, ptr_d, data.size(), memory_type::device); -// test_buffers.emplace_back(handle, ptr_d, data.size(), memory_type::device); -// #ifndef RAFT_DISABLE_GPU +TEST(Buffer, non_owning_device_buffer) +{ + raft::resources handle; + auto data = std::vector{1, 2, 3}; + auto exts = raft::make_extents(data.size()); + auto* ptr_d = static_cast(nullptr); +#ifndef RAFT_DISABLE_GPU + cudaMalloc(reinterpret_cast(&ptr_d), sizeof(int) * data.size()); + cudaMemcpy(static_cast(ptr_d), + static_cast(data.data()), + sizeof(int) * data.size(), + cudaMemcpyHostToDevice); +#endif + auto test_buffers = std::vector>{}; + test_buffers.emplace_back(handle, ptr_d, exts, memory_type::device); + test_buffers.emplace_back(handle, ptr_d, exts, memory_type::device); +#ifndef RAFT_DISABLE_GPU -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::device); -// ASSERT_EQ(buf.size(), data.size()); -// ASSERT_EQ(buf.data_handle(), ptr_d); + for (auto& buf : test_buffers) { + ASSERT_EQ(buf.mem_type(), memory_type::device); + ASSERT_EQ(buf.size(), data.size()); + ASSERT_EQ(buf.data_handle(), ptr_d); -// auto data_out = 
std::vector(data.size()); -// cudaMemcpy(static_cast(data_out.data()), -// static_cast(buf.data_handle()), -// sizeof(int) * data.size(), -// cudaMemcpyDeviceToHost); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// } -// cudaFree(reinterpret_cast(ptr_d)); -// #endif -// } + auto data_out = std::vector(data.size()); + cudaMemcpy(static_cast(data_out.data()), + static_cast(buf.data_handle()), + sizeof(int) * data.size(), + cudaMemcpyDeviceToHost); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + } + cudaFree(reinterpret_cast(ptr_d)); +#endif +} -// TEST(Buffer, host_buffer) -// { -// raft::resources handle; -// auto data = std::vector{1, 2, 3}; -// auto test_buffers = std::vector>{}; -// test_buffers.emplace_back(handle, data.size(), memory_type::host); -// test_buffers.emplace_back(handle, data.size(), memory_type::host); -// test_buffers.emplace_back(handle, data.size(), memory_type::host); -// test_buffers.emplace_back(handle, data.size()); +TEST(Buffer, host_buffer) +{ + raft::resources handle; + auto data = std::vector{1, 2, 3}; + auto exts = raft::make_extents(data.size()); -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::host); -// ASSERT_EQ(buf.size(), data.size()); -// ASSERT_NE(buf.data_handle(), nullptr); + auto test_buffers = std::vector>{}; + test_buffers.emplace_back(handle, exts, memory_type::host); + test_buffers.emplace_back(handle, exts, memory_type::host); + test_buffers.emplace_back(handle, exts, memory_type::host); + test_buffers.emplace_back(handle, exts); -// std::memcpy( -// static_cast(buf.data_handle()), static_cast(data.data()), data.size() * sizeof(int)); + for (auto& buf : test_buffers) { + ASSERT_EQ(buf.mem_type(), memory_type::host); + ASSERT_EQ(buf.size(), data.size()); + ASSERT_NE(buf.data_handle(), nullptr); -// auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// } -// } + 
std::memcpy( + static_cast(buf.data_handle()), static_cast(data.data()), data.size() * sizeof(int)); -// TEST(Buffer, non_owning_host_buffer) -// { -// raft::resources handle; -// auto data = std::vector{1, 2, 3}; -// std::vector> test_buffers; -// test_buffers.emplace_back(handle, data.data(), data.size(), memory_type::host); -// test_buffers.emplace_back(handle, data.data(), data.size(), memory_type::host); -// test_buffers.emplace_back(handle, data.data(), data.size()); + auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + } +} -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::host); -// ASSERT_EQ(buf.size(), data.size()); -// ASSERT_EQ(buf.data_handle(), data.data()); +TEST(Buffer, non_owning_host_buffer) +{ + raft::resources handle; + auto data = std::vector{1, 2, 3}; + auto exts = raft::make_extents(data.size()); + std::vector> test_buffers; + test_buffers.emplace_back(handle, data.data(), exts, memory_type::host); + test_buffers.emplace_back(handle, data.data(), exts, memory_type::host); + test_buffers.emplace_back(handle, data.data(), exts); -// auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// } -// } + for (auto& buf : test_buffers) { + ASSERT_EQ(buf.mem_type(), memory_type::host); + ASSERT_EQ(buf.size(), data.size()); + ASSERT_EQ(buf.data_handle(), data.data()); + + auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + } +} // TEST(Buffer, copy_constructor) // { From b410f367e89b3421cf98ba7354bc597dc59f89b3 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 12 Jun 2023 15:50:03 -0700 Subject: [PATCH 24/75] Style changes --- cpp/test/core/buffer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/core/buffer.cpp 
b/cpp/test/core/buffer.cpp index d380d65d09..8cac43f630 100644 --- a/cpp/test/core/buffer.cpp +++ b/cpp/test/core/buffer.cpp @@ -14,11 +14,11 @@ * limitations under the License. */ -#include "raft/core/mdspan.hpp" #include #include #include #include +#include #include #include #include From 4731620c255e6d36b23f153941a2cf6d21509e2b Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 13 Jun 2023 10:19:16 -0700 Subject: [PATCH 25/75] minor changes --- .../core/detail/buffer_utils/copy_cpu.hpp | 4 +- cpp/include/raft/core/error.hpp | 5 - cpp/include/raft/core/mdbuffer.hpp | 75 ++++++--------- cpp/test/core/buffer.cpp | 95 ++++++++++--------- 4 files changed, 82 insertions(+), 97 deletions(-) diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp index 5f879710fb..e2b0280ec8 100644 --- a/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp @@ -34,8 +34,8 @@ copy(raft::resources const& handle, T* dst, T const* src, uint32_t size) template std::enable_if_t< - std::conjunction_v, - std::bool_constant>, + std::conjunction_v, + std::bool_constant>, std::bool_constant>, void> copy(raft::resources const& handle, T* dst, T const* src, uint32_t size) diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp index 73f4813841..1fe62a8056 100644 --- a/cpp/include/raft/core/error.hpp +++ b/cpp/include/raft/core/error.hpp @@ -126,11 +126,6 @@ struct mem_type_mismatch : logic_error { mem_type_mismatch() : mem_type_mismatch("Memory type does not match expected type") {} explicit mem_type_mismatch(char const* msg) : logic_error(msg) {} }; - -struct wrong_device : logic_error { - wrong_device() : wrong_device("Attempted to use incorrect device") {} - explicit wrong_device(char const* msg) : logic_error(msg) {} -}; } // namespace raft // FIXME: Need to be replaced with RAFT_FAIL diff --git a/cpp/include/raft/core/mdbuffer.hpp 
b/cpp/include/raft/core/mdbuffer.hpp index 0918f89ef4..5a7980a57a 100644 --- a/cpp/include/raft/core/mdbuffer.hpp +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -302,19 +302,8 @@ struct buffer { } auto extents() const noexcept { return extents_; } HOST DEVICE auto* data_handle() const noexcept { - // auto result = static_cast(nullptr); - // switch (data_.index()) { - // case 0: {RAFT_LOG_INFO("0th"); result = std::get<0>(data_).get(); break;} - // case 1: {RAFT_LOG_INFO("1th"); result = std::get<1>(data_).get(); break;} - // case 2: {RAFT_LOG_INFO("2th"); result = std::get<2>(data_).get(); break;} - // case 3: {RAFT_LOG_INFO("3th"); result = std::get<3>(data_).get(); break;} - // case 4: {RAFT_LOG_INFO("4th"); result = std::get<4>(data_).get(); break;} - - // } - // RAFT_LOG_INFO("data_handle() called: data %p; cached_ptr %p\n", result, cached_ptr); - // return result; return cached_ptr; - } + } auto mem_type() const noexcept { @@ -323,10 +312,6 @@ struct buffer { ~buffer() = default; - // auto view() -> view_type { - // return make_mdspan mem_type()), is_device_accessible(this -> mem_type())>(data_, make_extents(size_)); - // } - HOST DEVICE auto view() const noexcept { if (data_.index() == 0) return std::get<0>(data_).view(); @@ -355,34 +340,34 @@ struct buffer { ElementType* cached_ptr; }; -// template -// detail::const_agnostic_same_t copy(raft::resources const& handle, -// buffer & dst, -// buffer const& src, -// size_t dst_offset, -// size_t src_offset, -// size_t size) -// { -// if constexpr (bounds_check) { -// if (src.size() - src_offset < size || dst.size() - dst_offset < size) { -// throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); -// } -// } -// auto src_device_type = is_device_accessible(src.mem_type()) ? device_type::gpu : device_type::cpu; -// auto dst_device_type = is_device_accessible(dst.mem_type()) ? 
device_type::gpu : device_type::cpu; -// detail::buffer_copy(handle, -// dst.data_handle() + dst_offset, -// src.data_handle() + src_offset, -// size, -// dst_device_type, -// src_device_type); -// } +template typename DstContainerPolicy, template typename SrcContainerPolicy> +detail::const_agnostic_same_t copy(raft::resources const& handle, + buffer & dst, + buffer const& src, + size_t dst_offset, + size_t src_offset, + size_t size) +{ + if constexpr (bounds_check) { + if (src.size() - src_offset < size || dst.size() - dst_offset < size) { + throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); + } + } + auto src_device_type = is_device_accessible(src.mem_type()) ? device_type::gpu : device_type::cpu; + auto dst_device_type = is_device_accessible(dst.mem_type()) ? device_type::gpu : device_type::cpu; + detail::buffer_copy(handle, + dst.data_handle() + dst_offset, + src.data_handle() + src_offset, + size, + dst_device_type, + src_device_type); +} -// template -// detail::const_agnostic_same_t copy(raft::resources const& handle, -// buffer& dst, -// buffer const& src) -// { -// copy(handle, dst, src, 0, 0, src.size()); -// } +template typename DstContainerPolicy, template typename SrcContainerPolicy> +detail::const_agnostic_same_t copy(raft::resources const& handle, + buffer& dst, + buffer const& src) +{ + copy(handle, dst, src, 0, 0, src.size()); +} } // namespace raft \ No newline at end of file diff --git a/cpp/test/core/buffer.cpp b/cpp/test/core/buffer.cpp index 8cac43f630..ac3fb679af 100644 --- a/cpp/test/core/buffer.cpp +++ b/cpp/test/core/buffer.cpp @@ -194,56 +194,58 @@ TEST(Buffer, non_owning_host_buffer) // } // } -// TEST(Buffer, move_buffer) -// { -// raft::resources handle; -// auto data = std::vector{1, 2, 3}; -// auto test_buffers = std::vector>{}; -// test_buffers.emplace_back(buffer(handle, data.data(), data.size(), memory_type::host)); -// test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), 
memory_type::host), memory_type::host); -// test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host); -// test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::host); +TEST(Buffer, move_buffer) +{ + raft::resources handle; + auto data = std::vector{1, 2, 3}; + auto exts = raft::make_extents(data.size()); + auto test_buffers = std::vector>{}; + test_buffers.emplace_back(handle, buffer(handle, data.data(), exts, memory_type::host), memory_type::host); + test_buffers.emplace_back(handle, buffer(handle, data.data(), exts, memory_type::host), memory_type::host); + test_buffers.emplace_back(handle, buffer(handle, data.data(), exts, memory_type::host), memory_type::host); -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::host); -// ASSERT_EQ(buf.size(), data.size()); -// ASSERT_EQ(buf.data_handle(), data.data()); + for (auto& buf : test_buffers) { + ASSERT_EQ(buf.mem_type(), memory_type::host); + ASSERT_EQ(buf.size(), data.size()); + ASSERT_EQ(buf.data_handle(), data.data()); -// auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// } -// #ifndef RAFT_DISABLE_GPU -// test_buffers = std::vector>{}; -// test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device); -// test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device); -// test_buffers.emplace_back(handle, buffer(handle, data.data(), data.size(), memory_type::host), memory_type::device); -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::device); -// ASSERT_EQ(buf.size(), data.size()); -// ASSERT_NE(buf.data_handle(), data.data()); + auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); + EXPECT_THAT(data_out, 
::testing::ElementsAreArray(data)); + } +#ifndef RAFT_DISABLE_GPU + auto test_dev_buffers = std::vector>{}; + test_buffers.emplace_back(handle, buffer(handle, data.data(), exts, memory_type::host), memory_type::device); + test_buffers.emplace_back(handle, buffer(handle, data.data(), exts, memory_type::host), memory_type::device); + test_buffers.emplace_back(handle, buffer(handle, data.data(), exts, memory_type::host), memory_type::device); + for (auto& buf : test_dev_buffers) { + ASSERT_EQ(buf.mem_type(), memory_type::device); + ASSERT_EQ(buf.size(), data.size()); + ASSERT_NE(buf.data_handle(), data.data()); -// auto data_out = std::vector(buf.size()); -// RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(buf.data_handle()), buf.size() * sizeof(int), cudaMemcpyDefault)); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// } -// #endif -// } + auto data_out = std::vector(buf.size()); + RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(buf.data_handle()), buf.size() * sizeof(int), cudaMemcpyDefault)); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); + } +#endif +} -// TEST(Buffer, move_assignment_buffer) -// { -// raft::resources handle; -// auto data = std::vector{1, 2, 3}; +TEST(Buffer, move_assignment_buffer) +{ + raft::resources handle; + auto data = std::vector{1, 2, 3}; + auto exts1 = raft::make_extents(data.size() - 1); + auto exts2 = raft::make_extents(data.size()); -// #ifndef RAFT_DISABLE_GPU -// auto buf = buffer{handle, data.data(), data.size() - 1, memory_type::device}; -// #else -// auto buf = buffer{handle, data.data(), data.size() - 1, memory_type::host}; -// #endif -// buf = buffer{handle, data.size(), memory_type::host}; +#ifndef RAFT_DISABLE_GPU + auto buf = buffer{handle, data.data(), exts1, memory_type::device}; +#else + auto buf = buffer{handle, data.data(), exts1, memory_type::host}; +#endif + buf = buffer{handle, exts2, memory_type::host}; -// ASSERT_EQ(buf.mem_type(), 
memory_type::host); -// ASSERT_EQ(buf.size(), data.size()); -// } + ASSERT_EQ(buf.mem_type(), memory_type::host); + ASSERT_EQ(buf.size(), data.size()); +} // TEST(Buffer, partial_buffer_copy) // { @@ -313,5 +315,8 @@ TEST(Buffer, non_owning_host_buffer) // EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); // #endif // } - +TEST(Buffer, view_buffer) +{ + raft::resources handle; +} } \ No newline at end of file From 238d010c5437334ee60642f613ae6cab18f40133 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 13 Jun 2023 17:25:04 -0700 Subject: [PATCH 26/75] combine owning buffer cpu/gpu --- .../detail/buffer_utils/owning_buffer.hpp | 102 +++++++++++++++++- .../buffer_utils/owning_buffer_base.hpp | 36 ------- .../detail/buffer_utils/owning_buffer_cpu.hpp | 61 ----------- .../detail/buffer_utils/owning_buffer_gpu.hpp | 62 ----------- cpp/include/raft/core/mdbuffer.hpp | 83 ++++++-------- cpp/test/CMakeLists.txt | 2 +- cpp/test/core/{buffer.cpp => mdbuffer.cpp} | 93 ++++++++-------- 7 files changed, 181 insertions(+), 258 deletions(-) delete mode 100644 cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp delete mode 100644 cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp delete mode 100644 cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp rename cpp/test/core/{buffer.cpp => mdbuffer.cpp} (74%) diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp index c8f8da128d..9c24ca9bab 100644 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp @@ -14,8 +14,102 @@ * limitations under the License. 
*/ #pragma once -#include "owning_buffer_cpu.hpp" +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#endif +#include #include -#ifndef RAFT_DISABLE_GPU -#include "owning_buffer_gpu.hpp" -#endif \ No newline at end of file +#include +#include +#include +#include + +namespace raft { +namespace detail { + template typename ContainerPolicy> +struct owning_host_buffer { + using element_type = std::remove_cv_t; + using container_policy = std::conditional_t, ContainerPolicy>, + std::variant_alternative_t<0, buffer_container_policy>, + ContainerPolicy>; + using index_type = typename Extents::index_type; + using buffer = host_mdarray; + owning_host_buffer(raft::resources const& handle, Extents extents) noexcept(false) + : extents_{extents}, data_{[&extents, handle]() { + typename buffer::mapping_type layout{extents}; + typename buffer::container_policy_type policy{}; + buffer host_data{handle, layout, policy}; + return host_data; + }()} + { + } + + auto* get() const { return const_cast(data_.data_handle()); } + + auto view() { + return data_.view(); + } + + private: + Extents extents_; + buffer data_; +}; + +#ifndef RAFT_DISABLE_CUDA +template typename ContainerPolicy> +struct owning_device_buffer { + using element_type = std::remove_cv_t; + using container_policy = std::conditional_t, ContainerPolicy>, + std::variant_alternative_t<1, buffer_container_policy>, + ContainerPolicy>; + using index_type = typename Extents::index_type; + using buffer = device_mdarray; + + owning_device_buffer() : data_{} {} + + owning_device_buffer(raft::resources const& handle, Extents extents) noexcept(false) + : extents_{extents}, data_{[&extents, handle]() { + typename buffer::mapping_type layout{extents}; + typename buffer::container_policy_type policy{}; + buffer device_data{handle, layout, policy}; + return device_data; + }()} + { + } + + auto* get() const {return const_cast(data_.data_handle());} + + auto view() { + data_.view(); + } + private: + Extents extents_; + buffer data_; 
+}; +#else +template typename ContainerPolicy> +struct owning_device_buffer { + owning_device_buffer(raft::resources const& handle, Extents extents) : extents_(extents){} + auto* get() const { return static_cast(nullptr); } + + auto view() { + return host_mdspan(nullptr, exts); + } + + private: + Extents extents_; +}; +#endif +} // namespace detail +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp deleted file mode 100644 index 6b7b1e44b1..0000000000 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_base.hpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once -#include -#include -#include - -namespace raft { -namespace detail { - -template typename ContainerPolicy> -struct owning_buffer { - owning_buffer() {} - owning_buffer(raft::resources const& handle, Extents extents) {} - auto* get() const { return static_cast(nullptr); } -}; - -} // namespace detail -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp deleted file mode 100644 index fa8205b2ed..0000000000 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_cpu.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once -#include "owning_buffer_base.hpp" -#include "raft/core/mdspan.hpp" -#include -#include -#include -#include -#include -#include -#include - -namespace raft { -namespace detail { - template typename ContainerPolicy> -struct owning_buffer { - using element_type = std::remove_cv_t; - using container_policy = std::conditional_t, ContainerPolicy>, - std::variant_alternative_t<0, buffer_container_policy>, - ContainerPolicy>; - using index_type = typename Extents::index_type; - using owning_host_buffer = host_mdarray; - owning_buffer(raft::resources const& handle, Extents extents) noexcept(false) - : extents_{extents}, data_{[&extents, handle]() { - typename owning_host_buffer::mapping_type layout{extents}; - typename owning_host_buffer::container_policy_type policy{}; - owning_host_buffer host_data{handle, layout, policy}; - return host_data; - }()} - { - } - - auto* get() const { return const_cast(data_.data_handle()); } - - auto view() { - return data_.view(); - } - - private: - Extents extents_; - owning_host_buffer data_; -}; -} // namespace detail -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp deleted file mode 100644 index 7ef0c86396..0000000000 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer_gpu.hpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include "raft/core/logger.hpp" -#include "owning_buffer_base.hpp" -#include -#include -#include -#include -#include -#include - -namespace raft { -namespace detail { - template typename ContainerPolicy> -struct owning_buffer { - using element_type = std::remove_cv_t; - using container_policy = std::conditional_t, ContainerPolicy>, - std::variant_alternative_t<1, buffer_container_policy>, - ContainerPolicy>; - using index_type = typename Extents::index_type; - using owning_device_buffer = device_mdarray; - - owning_buffer() : data_{} {} - - owning_buffer(raft::resources const& handle, Extents extents) noexcept(false) - : extents_{extents}, data_{[&extents, handle]() { - typename owning_device_buffer::mapping_type layout{extents}; - typename owning_device_buffer::container_policy_type policy{}; - owning_device_buffer device_data{handle, layout, policy}; - return device_data; - }()} - { - } - - auto* get() const {return const_cast(data_.data_handle());} - - auto view() { - data_.view(); - } - private: - Extents extents_; - owning_device_buffer data_; -}; -} // namespace detail -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp index 5a7980a57a..78b4f03f9b 100644 --- a/cpp/include/raft/core/mdbuffer.hpp +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -14,7 +14,6 @@ * limitations under the License. */ #pragma once -#include "raft/core/logger.hpp" #include #include #include @@ -41,27 +40,27 @@ namespace raft { * @tparam LayoutPolicy layout of the input * @tparam ContainerPolicy container to be used to own host/device memory if needed. * Users must ensure that the container has the correct type (host/device). Exceptions - * due to a device container being used for a host buffer and vice versa are not caught - * by the buffer class. 
+ * due to a device container being used for a host mdbuffer and vice versa are not caught + * by the mdbuffer class. * @tparam the index type of the extents */ template typename ContainerPolicy = buffer_container_policy> -struct buffer { +struct mdbuffer { using data_store = std::variant, detail::non_owning_buffer, detail::non_owning_buffer, - detail::owning_buffer, - detail::owning_buffer>; + detail::owning_host_buffer, + detail::owning_device_buffer>; - buffer() : device_type_{}, data_{}, length_{0}, memory_type_{memory_type::host} {} + mdbuffer() : device_type_{}, data_{}, length_{0}, memory_type_{memory_type::host} {} - /** Construct non-initialized owning buffer. For owning buffers, managed memory is treated as + /** Construct non-initialized owning mdbuffer. For owning buffers, managed memory is treated as * device memory only. Therefore, users are discouraged from using managed memory for creating * owning buffers. */ - buffer(raft::resources const& handle, + mdbuffer(raft::resources const& handle, Extents extents, memory_type mem_type = memory_type::host) : device_type_{[mem_type]() { @@ -71,9 +70,9 @@ struct buffer { data_{[this, mem_type, handle]() { auto result = data_store{}; if (is_device_accessible(mem_type)) { - result = detail::owning_buffer{handle, extents_}; + result = detail::owning_device_buffer{handle, extents_}; } else { - result = detail::owning_buffer{handle, extents_}; + result = detail::owning_host_buffer{handle, extents_}; } return result; }()}, @@ -96,13 +95,12 @@ struct buffer { { } - /** Construct non-owning buffer. Currently, users must ensure that the input_data is on the same device_type as the requested mem_type. + /** Construct non-owning mdbuffer. Currently, users must ensure that the input_data is on the same device_type as the requested mem_type. This cannot be asserted because checking the device id requires CUDA headers (which is against the intended cpu-gpu interop). 
If the mem_type is different from the device_type of input_data, the input_data should first be copied to the appropriate location. For managed memory_type, input_data should be a managed pointer. */ - buffer(raft::resources const& handle, ElementType* input_data, Extents extents, memory_type mem_type = memory_type::host) + mdbuffer(raft::resources const& handle, ElementType* input_data, Extents extents, memory_type mem_type = memory_type::host) : device_type_{[mem_type]() { - RAFT_LOG_INFO("Non owning constructor call started"); return is_device_accessible(mem_type) ? device_type::gpu : device_type::cpu; }()}, extents_{extents}, @@ -127,25 +125,22 @@ struct buffer { memory_type_{mem_type}, cached_ptr{[this]() { auto result = static_cast(nullptr); - RAFT_LOG_INFO("data_index from constructor %d\n", data_.index()); switch (data_.index()) { case 0: result = std::get<0>(data_).get(); break; case 1: result = std::get<1>(data_).get(); break; } - RAFT_LOG_INFO("data pointer from constructor %p\n", result); return result; }()} { - RAFT_LOG_INFO("Non owning constructor call complete"); } /** - * @brief Construct one buffer of the given memory type from another. - * A buffer constructed in this way is owning and will copy the data from + * @brief Construct one mdbuffer of the given memory type from another. + * A mdbuffer constructed in this way is owning and will copy the data from * the original location. */ - buffer(raft::resources const& handle, - buffer const& other, + mdbuffer(raft::resources const& handle, + mdbuffer const& other, memory_type mem_type) : device_type_{[mem_type]() { return is_device_accessible(mem_type) ? 
device_type::gpu : device_type::cpu; @@ -156,16 +151,14 @@ struct buffer { auto result_data = static_cast(nullptr); if (is_device_accessible(mem_type)) { auto buf = - detail::owning_buffer(handle, extents_); + detail::owning_device_buffer(handle, extents_); result_data = buf.get(); result = std::move(buf); - RAFT_LOG_INFO("gpu copy called"); detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::gpu, other.dev_type()); } else { - auto buf = detail::owning_buffer(handle, extents_); + auto buf = detail::owning_host_buffer(handle, extents_); result_data = buf.get(); result = std::move(buf); - RAFT_LOG_INFO("cpu copy called"); detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::cpu, other.dev_type()); } return result; @@ -187,10 +180,9 @@ struct buffer { return result; }()} { - RAFT_LOG_INFO("Pointer to other's data %p\n", other.data_handle()); } - friend void swap(buffer& first, buffer& second) + friend void swap(mdbuffer& first, mdbuffer& second) { using std::swap; swap(first.device_type_, second.device_type_); @@ -199,25 +191,25 @@ struct buffer { swap(first.memory_type_, second.memory_type_); swap(first.cached_ptr, second.cached_ptr); } - buffer& operator=(buffer const& other) { + mdbuffer& operator=(mdbuffer const& other) { auto copy = other; swap(*this, copy); return *this; } /** - * @brief Create owning copy of existing buffer with given stream - * The device type of this new buffer will be the same as the original + * @brief Create owning copy of existing mdbuffer with given stream + * The device type of this new mdbuffer will be the same as the original */ - buffer(raft::resources const& handle, buffer const& other) : buffer(handle, other, other.mem_type()) + mdbuffer(raft::resources const& handle, mdbuffer const& other) : mdbuffer(handle, other, other.mem_type()) { } /** - * @brief Move from existing buffer unless a copy is necessary based on + * @brief Move from existing mdbuffer unless a 
copy is necessary based on * memory location */ - buffer(raft::resources const& handle, buffer&& other, memory_type mem_type) + mdbuffer(raft::resources const& handle, mdbuffer&& other, memory_type mem_type) : device_type_{[mem_type]() { return is_device_accessible(mem_type) ? device_type::gpu : device_type::cpu; }()}, @@ -229,8 +221,7 @@ struct buffer { } else { auto* result_data = static_cast(nullptr); if (is_device_accessible(mem_type)) { - auto buf = detail::owning_buffer{handle, extents_}; @@ -238,8 +229,7 @@ struct buffer { result = std::move(buf); detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::gpu, other.dev_type()); } else { - auto buf = detail::owning_buffer{handle, extents_}; result_data = buf.get(); result = std::move(buf); @@ -261,10 +251,9 @@ struct buffer { return result; }()} { - RAFT_LOG_INFO("main move called"); } - buffer(buffer&& other) noexcept + mdbuffer(mdbuffer&& other) noexcept : device_type_{[&other]() { return is_device_accessible(other.mem_type()) ? 
device_type::gpu : device_type::cpu; }()}, @@ -288,10 +277,8 @@ struct buffer { return result; }()} { - RAFT_LOG_INFO("trivial move called"); } - buffer& operator=(buffer&& other) noexcept { - RAFT_LOG_INFO("operator= move called"); + mdbuffer& operator=(mdbuffer&& other) noexcept { device_type_ = std::move(other.device_type_); extents_ = std::move(other.extents_); data_ = std::move(other.data_); @@ -310,7 +297,7 @@ struct buffer { return memory_type_; } - ~buffer() = default; + ~mdbuffer() = default; HOST DEVICE auto view() const noexcept { if (data_.index() == 0) @@ -342,15 +329,15 @@ struct buffer { template typename DstContainerPolicy, template typename SrcContainerPolicy> detail::const_agnostic_same_t copy(raft::resources const& handle, - buffer & dst, - buffer const& src, + mdbuffer & dst, + mdbuffer const& src, size_t dst_offset, size_t src_offset, size_t size) { if constexpr (bounds_check) { if (src.size() - src_offset < size || dst.size() - dst_offset < size) { - throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); + throw out_of_bounds("Attempted copy to or from mdbuffer of inadequate size"); } } auto src_device_type = is_device_accessible(src.mem_type()) ? 
device_type::gpu : device_type::cpu; @@ -365,8 +352,8 @@ detail::const_agnostic_same_t copy(raft::resources const& handle, template typename DstContainerPolicy, template typename SrcContainerPolicy> detail::const_agnostic_same_t copy(raft::resources const& handle, - buffer& dst, - buffer const& src) + mdbuffer& dst, + mdbuffer const& src) { copy(handle, dst, src, 0, 0, src.size()); } diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index ac0b025cd5..86b001483a 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -99,7 +99,7 @@ if(BUILD_TESTS) NAME CORE_TEST PATH - test/core/buffer.cpp + test/core/mdbuffer.cpp test/core/logger.cpp test/core/math_device.cu test/core/math_host.cpp diff --git a/cpp/test/core/buffer.cpp b/cpp/test/core/mdbuffer.cpp similarity index 74% rename from cpp/test/core/buffer.cpp rename to cpp/test/core/mdbuffer.cpp index ac3fb679af..8414bf7946 100644 --- a/cpp/test/core/buffer.cpp +++ b/cpp/test/core/mdbuffer.cpp @@ -29,7 +29,7 @@ namespace raft { TEST(Buffer, default_buffer) { - auto buf = buffer>(); + auto buf = mdbuffer>(); EXPECT_EQ(buf.mem_type(), memory_type::host); EXPECT_EQ(buf.size(), 0); ASSERT_NE(buf.data_handle(), nullptr); @@ -40,7 +40,7 @@ TEST(Buffer, device_buffer) raft::resources handle; auto data = std::vector{1, 2, 3}; auto exts = raft::make_extents(data.size()); - auto test_buffers = std::vector>{}; + auto test_buffers = std::vector>{}; test_buffers.emplace_back(handle, exts, memory_type::device); test_buffers.emplace_back(handle, exts, memory_type::device); test_buffers.emplace_back(handle, exts, memory_type::device); @@ -71,7 +71,7 @@ TEST(Buffer, non_owning_device_buffer) sizeof(int) * data.size(), cudaMemcpyHostToDevice); #endif - auto test_buffers = std::vector>{}; + auto test_buffers = std::vector>{}; test_buffers.emplace_back(handle, ptr_d, exts, memory_type::device); test_buffers.emplace_back(handle, ptr_d, exts, memory_type::device); #ifndef RAFT_DISABLE_GPU @@ -98,7 +98,7 @@ 
TEST(Buffer, host_buffer) auto data = std::vector{1, 2, 3}; auto exts = raft::make_extents(data.size()); - auto test_buffers = std::vector>{}; + auto test_buffers = std::vector>{}; test_buffers.emplace_back(handle, exts, memory_type::host); test_buffers.emplace_back(handle, exts, memory_type::host); test_buffers.emplace_back(handle, exts, memory_type::host); @@ -122,7 +122,7 @@ TEST(Buffer, non_owning_host_buffer) raft::resources handle; auto data = std::vector{1, 2, 3}; auto exts = raft::make_extents(data.size()); - std::vector> test_buffers; + std::vector> test_buffers; test_buffers.emplace_back(handle, data.data(), exts, memory_type::host); test_buffers.emplace_back(handle, data.data(), exts, memory_type::host); test_buffers.emplace_back(handle, data.data(), exts); @@ -141,10 +141,10 @@ TEST(Buffer, non_owning_host_buffer) // { // raft::resources handle; // auto data = std::vector{1, 2, 3}; -// buffer const orig_buffer = buffer(handle, data.data(), data.size(), memory_type::host); +// mdbuffer const orig_buffer = mdbuffer(handle, data.data(), data.size(), memory_type::host); // // host to host copy operations -// auto test_buffers = std::vector>{}; +// auto test_buffers = std::vector>{}; // test_buffers.emplace_back(handle, orig_buffer); // test_buffers.emplace_back(handle, orig_buffer, memory_type::host); // test_buffers.emplace_back(handle, orig_buffer, memory_type::host); @@ -160,7 +160,7 @@ TEST(Buffer, non_owning_host_buffer) // #ifndef RAFT_DISABLE_GPU // // host to device copy operations -// auto test_dev_buffers = std::vector>{}; +// auto test_dev_buffers = std::vector>{}; // test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); // test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); // test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); @@ -170,7 +170,7 @@ TEST(Buffer, non_owning_host_buffer) // EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); // // device to device copy operations -// 
auto test_dev_copies = std::vector>{}; +// auto test_dev_copies = std::vector>{}; // test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); // test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); // test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); @@ -181,7 +181,7 @@ TEST(Buffer, non_owning_host_buffer) // // } // // // device to host copy operations -// // auto test_host_buffers = std::vector>{}; +// // auto test_host_buffers = std::vector>{}; // // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); // // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); // // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); @@ -199,10 +199,10 @@ TEST(Buffer, move_buffer) raft::resources handle; auto data = std::vector{1, 2, 3}; auto exts = raft::make_extents(data.size()); - auto test_buffers = std::vector>{}; - test_buffers.emplace_back(handle, buffer(handle, data.data(), exts, memory_type::host), memory_type::host); - test_buffers.emplace_back(handle, buffer(handle, data.data(), exts, memory_type::host), memory_type::host); - test_buffers.emplace_back(handle, buffer(handle, data.data(), exts, memory_type::host), memory_type::host); + auto test_buffers = std::vector>{}; + test_buffers.emplace_back(handle, mdbuffer(handle, data.data(), exts, memory_type::host), memory_type::host); + test_buffers.emplace_back(handle, mdbuffer(handle, data.data(), exts, memory_type::host), memory_type::host); + test_buffers.emplace_back(handle, mdbuffer(handle, data.data(), exts, memory_type::host), memory_type::host); for (auto& buf : test_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::host); @@ -213,10 +213,10 @@ TEST(Buffer, move_buffer) EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); } #ifndef RAFT_DISABLE_GPU - auto test_dev_buffers = std::vector>{}; - test_buffers.emplace_back(handle, buffer(handle, data.data(), exts, memory_type::host), memory_type::device); - 
test_buffers.emplace_back(handle, buffer(handle, data.data(), exts, memory_type::host), memory_type::device); - test_buffers.emplace_back(handle, buffer(handle, data.data(), exts, memory_type::host), memory_type::device); + auto test_dev_buffers = std::vector>{}; + test_buffers.emplace_back(handle, mdbuffer(handle, data.data(), exts, memory_type::host), memory_type::device); + test_buffers.emplace_back(handle, mdbuffer(handle, data.data(), exts, memory_type::host), memory_type::device); + test_buffers.emplace_back(handle, mdbuffer(handle, data.data(), exts, memory_type::host), memory_type::device); for (auto& buf : test_dev_buffers) { ASSERT_EQ(buf.mem_type(), memory_type::device); ASSERT_EQ(buf.size(), data.size()); @@ -237,57 +237,58 @@ TEST(Buffer, move_assignment_buffer) auto exts2 = raft::make_extents(data.size()); #ifndef RAFT_DISABLE_GPU - auto buf = buffer{handle, data.data(), exts1, memory_type::device}; + auto buf = mdbuffer{handle, data.data(), exts1, memory_type::device}; #else - auto buf = buffer{handle, data.data(), exts1, memory_type::host}; + auto buf = mdbuffer{handle, data.data(), exts1, memory_type::host}; #endif - buf = buffer{handle, exts2, memory_type::host}; + buf = mdbuffer{handle, exts2, memory_type::host}; ASSERT_EQ(buf.mem_type(), memory_type::host); ASSERT_EQ(buf.size(), data.size()); } -// TEST(Buffer, partial_buffer_copy) -// { -// raft::resources handle; -// auto data1 = std::vector{1, 2, 3, 4, 5}; -// auto data2 = std::vector{0, 0, 0, 0, 0}; -// auto expected = std::vector{0, 3, 4, 5, 0}; -// #ifndef RAFT_DISABLE_GPU -// auto buf1 = buffer{handle, buffer{handle, data1.data(), data1.size(), memory_type::host}, memory_type::device}; -// #else -// auto buf1 = buffer{handle, data1.data(), data1.size(), memory_type::host}; -// #endif -// auto buf2 = buffer{handle, data2.data(), data2.size(), memory_type::host}; -// copy(handle, buf2, buf1, 1, 2, 3); -// copy(handle, buf2, buf1, 1, 2, 3); -// EXPECT_THROW(copy(handle, buf2, buf1, 1, 2, 4), 
out_of_bounds); -// } +TEST(Buffer, partial_buffer_copy) +{ + raft::resources handle; + auto data1 = std::vector{1, 2, 3, 4, 5}; + auto data2 = std::vector{0, 0, 0, 0, 0}; + auto expected = std::vector{0, 3, 4, 5, 0}; + auto exts = raft::make_extents(data1.size()); +#ifndef RAFT_DISABLE_GPU + auto buf1 = mdbuffer{handle, mdbuffer{handle, data1.data(), exts, memory_type::host}, memory_type::device}; +#else + auto buf1 = mdbuffer{handle, data1.data(), exts, memory_type::host}; +#endif + auto buf2 = mdbuffer{handle, data2.data(), exts, memory_type::host}; + copy(handle, buf2, buf1, 1, 2, 3); + copy(handle, buf2, buf1, 1, 2, 3); + EXPECT_THROW(copy(handle, buf2, buf1, 1, 2, 4), out_of_bounds); +} // TEST(Buffer, buffer_copy_overloads) // { // raft::resources handle; // auto data = std::vector{1, 2, 3}; // auto expected = data; -// auto orig_host_buffer = buffer(handle, data.data(), data.size(), memory_type::host); -// auto orig_dev_buffer = buffer(handle, orig_host_buffer, memory_type::device); -// auto copy_dev_buffer = buffer(handle, data.size(), memory_type::device); +// auto orig_host_buffer = mdbuffer(handle, data.data(), data.size(), memory_type::host); +// auto orig_dev_buffer = mdbuffer(handle, orig_host_buffer, memory_type::device); +// auto copy_dev_buffer = mdbuffer(handle, data.size(), memory_type::device); // // copying host to host // auto data_out = std::vector(data.size()); -// auto copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); +// auto copy_host_buffer = mdbuffer(handle, data_out.data(), data.size(), memory_type::host); // copy(handle, copy_host_buffer, orig_host_buffer); // EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); // // copying host to host with stream // data_out = std::vector(data.size()); -// copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); +// copy_host_buffer = mdbuffer(handle, data_out.data(), data.size(), memory_type::host); // copy(handle, 
copy_host_buffer, orig_host_buffer); // EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); // // copying host to host with offset // data_out = std::vector(data.size() + 1); -// copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); +// copy_host_buffer = mdbuffer(handle, data_out.data(), data.size(), memory_type::host); // copy(handle, copy_host_buffer, orig_host_buffer, 2, 1, 1); // expected = std::vector{0, 0, 2, 0}; // EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); @@ -295,21 +296,21 @@ TEST(Buffer, move_assignment_buffer) // #ifndef RAFT_DISABLE_GPU // // copy device to host // data_out = std::vector(data.size()); -// copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); +// copy_host_buffer = mdbuffer(handle, data_out.data(), data.size(), memory_type::host); // copy(handle, copy_host_buffer, orig_dev_buffer); // expected = data; // EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); // // copy device to host with stream // data_out = std::vector(data.size()); -// copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); +// copy_host_buffer = mdbuffer(handle, data_out.data(), data.size(), memory_type::host); // copy(handle, copy_host_buffer, orig_dev_buffer); // expected = data; // EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); // // copy device to host with offset // data_out = std::vector(data.size() + 1); -// copy_host_buffer = buffer(handle, data_out.data(), data.size(), memory_type::host); +// copy_host_buffer = mdbuffer(handle, data_out.data(), data.size(), memory_type::host); // copy(handle, copy_host_buffer, orig_dev_buffer, 2, 1, 1); // expected = std::vector{0, 0, 2, 0}; // EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); From 75cfcf189ff6ce8fa9ca72e84348a34600bf43a3 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 20 Jun 2023 11:49:06 -0700 Subject: [PATCH 27/75] update tests --- 
cpp/test/core/buffer.cu | 57 --------------- cpp/test/core/mdbuffer.cpp | 143 +++++++++---------------------------- 2 files changed, 35 insertions(+), 165 deletions(-) delete mode 100644 cpp/test/core/buffer.cu diff --git a/cpp/test/core/buffer.cu b/cpp/test/core/buffer.cu deleted file mode 100644 index d7b308b4df..0000000000 --- a/cpp/test/core/buffer.cu +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace raft { - -__global__ void check_buffer_access(int* buf) { - if (buf[0] == 1) { - buf[0] = 4; - } - if (buf[1] == 2) { - buf[1] = 5; - } - if (buf[2] == 3) { - buf[2] = 6; - } -} - -TEST(Buffer, device_buffer_access) -{ - auto data = std::vector{1, 2, 3}; - auto expected = std::vector{4, 5, 6}; - raft::resources handle; - auto buf = buffer( - handle, - buffer(handle, data.data(), data.size(), memory_type::host), - memory_type::device); - // check_buffer_access<<<1,1>>>(buf.data()); - // auto data_out = std::vector(expected.size()); - // auto host_buf = buffer(data_out.data(), data_out.size(), memory_type::host); - // copy(host_buf, buf); - // ASSERT_EQ(cudaStreamSynchronize(execution_stream{}), cudaSuccess); - // EXPECT_THAT(data_out, testing::ElementsAreArray(expected)); -} - -} \ No newline at end of file diff --git a/cpp/test/core/mdbuffer.cpp b/cpp/test/core/mdbuffer.cpp index 8414bf7946..8c6bb02ae3 100644 --- a/cpp/test/core/mdbuffer.cpp +++ b/cpp/test/core/mdbuffer.cpp @@ -137,63 +137,6 @@ TEST(Buffer, non_owning_host_buffer) } } -// TEST(Buffer, copy_constructor) -// { -// raft::resources handle; -// auto data = std::vector{1, 2, 3}; -// mdbuffer const orig_buffer = mdbuffer(handle, data.data(), data.size(), memory_type::host); - -// // host to host copy operations -// auto test_buffers = std::vector>{}; -// test_buffers.emplace_back(handle, orig_buffer); -// test_buffers.emplace_back(handle, orig_buffer, memory_type::host); -// test_buffers.emplace_back(handle, orig_buffer, memory_type::host); -// test_buffers.emplace_back(handle, orig_buffer, memory_type::host); - -// for (auto& buf : test_buffers) { -// ASSERT_EQ(buf.mem_type(), memory_type::host); -// ASSERT_EQ(buf.size(), data.size()); -// ASSERT_NE(buf.data_handle(), orig_buffer.data_handle()); - -// auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); -// EXPECT_THAT(data_out, 
::testing::ElementsAreArray(data)); - -// #ifndef RAFT_DISABLE_GPU -// // host to device copy operations -// auto test_dev_buffers = std::vector>{}; -// test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); -// test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); -// test_dev_buffers.emplace_back(handle, orig_buffer, memory_type::device); -// for (auto& dev_buf : test_dev_buffers) { -// data_out = std::vector(data.size()); -// RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(dev_buf.data_handle()), dev_buf.size() * sizeof(int), cudaMemcpyDefault)); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - -// // device to device copy operations -// auto test_dev_copies = std::vector>{}; -// test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); -// test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); -// test_dev_copies.emplace_back(handle, dev_buf, memory_type::device); -// // for (auto& copy_buf : test_dev_copies) { -// // data_out = std::vector(data.size()); -// // RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(copy_buf.data_handle()), copy_buf.size() * sizeof(int), cudaMemcpyDefault)); -// // EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// // } - -// // // device to host copy operations -// // auto test_host_buffers = std::vector>{}; -// // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); -// // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); -// // test_host_buffers.emplace_back(handle, dev_buf, memory_type::host); -// // for (auto& host_buf : test_host_buffers) { -// // data_out = std::vector(host_buf.data_handle(), host_buf.data_handle() + host_buf.size()); -// // EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); -// // } -// } -// #endif -// } -// } - TEST(Buffer, move_buffer) { raft::resources handle; @@ -265,59 +208,43 @@ TEST(Buffer, partial_buffer_copy) EXPECT_THROW(copy(handle, buf2, 
buf1, 1, 2, 4), out_of_bounds); } -// TEST(Buffer, buffer_copy_overloads) -// { -// raft::resources handle; -// auto data = std::vector{1, 2, 3}; -// auto expected = data; -// auto orig_host_buffer = mdbuffer(handle, data.data(), data.size(), memory_type::host); -// auto orig_dev_buffer = mdbuffer(handle, orig_host_buffer, memory_type::device); -// auto copy_dev_buffer = mdbuffer(handle, data.size(), memory_type::device); +TEST(Buffer, buffer_copy_overloads) +{ + raft::resources handle; + auto data = std::vector{1, 2, 3}; + auto expected = data; + auto exts = raft::make_extents(data.size()); + auto orig_host_buffer = mdbuffer(handle, data.data(), exts, memory_type::host); + auto orig_dev_buffer = mdbuffer(handle, orig_host_buffer, memory_type::device); + auto copy_dev_buffer = mdbuffer(handle, exts, memory_type::device); -// // copying host to host -// auto data_out = std::vector(data.size()); -// auto copy_host_buffer = mdbuffer(handle, data_out.data(), data.size(), memory_type::host); -// copy(handle, copy_host_buffer, orig_host_buffer); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); - -// // copying host to host with stream -// data_out = std::vector(data.size()); -// copy_host_buffer = mdbuffer(handle, data_out.data(), data.size(), memory_type::host); -// copy(handle, copy_host_buffer, orig_host_buffer); -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); - -// // copying host to host with offset -// data_out = std::vector(data.size() + 1); -// copy_host_buffer = mdbuffer(handle, data_out.data(), data.size(), memory_type::host); -// copy(handle, copy_host_buffer, orig_host_buffer, 2, 1, 1); -// expected = std::vector{0, 0, 2, 0}; -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); - -// #ifndef RAFT_DISABLE_GPU -// // copy device to host -// data_out = std::vector(data.size()); -// copy_host_buffer = mdbuffer(handle, data_out.data(), data.size(), memory_type::host); -// copy(handle, copy_host_buffer, 
orig_dev_buffer); -// expected = data; -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + // copying host to host + auto data_out = std::vector(data.size()); + auto copy_host_buffer = mdbuffer(handle, data_out.data(), exts, memory_type::host); + copy(handle, copy_host_buffer, orig_host_buffer); + EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); + + // copying host to host with offset + data_out = std::vector(data.size() + 1); + copy_host_buffer = mdbuffer(handle, data_out.data(), exts, memory_type::host); + copy(handle, copy_host_buffer, orig_host_buffer, 2, 1, 1); + expected = std::vector{0, 0, 2, 0}; + EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); -// // copy device to host with stream -// data_out = std::vector(data.size()); -// copy_host_buffer = mdbuffer(handle, data_out.data(), data.size(), memory_type::host); -// copy(handle, copy_host_buffer, orig_dev_buffer); -// expected = data; -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); +#ifndef RAFT_DISABLE_GPU + // copy device to host + data_out = std::vector(data.size()); + copy_host_buffer = mdbuffer(handle, data_out.data(), exts, memory_type::host); + copy(handle, copy_host_buffer, orig_dev_buffer); + expected = data; + EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); -// // copy device to host with offset -// data_out = std::vector(data.size() + 1); -// copy_host_buffer = mdbuffer(handle, data_out.data(), data.size(), memory_type::host); -// copy(handle, copy_host_buffer, orig_dev_buffer, 2, 1, 1); -// expected = std::vector{0, 0, 2, 0}; -// EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); -// #endif -// } -TEST(Buffer, view_buffer) -{ - raft::resources handle; + // copy device to host with offset + data_out = std::vector(data.size() + 1); + copy_host_buffer = mdbuffer(handle, data_out.data(), exts, memory_type::host); + copy(handle, copy_host_buffer, orig_dev_buffer, 2, 1, 1); + expected = std::vector{0, 0, 2, 0}; + 
EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); +#endif } } \ No newline at end of file From 7b1909fd767bf123f2b94ca7832924d90d34c765 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 3 Jul 2023 10:25:36 -0700 Subject: [PATCH 28/75] Updates --- .../core/detail/buffer_utils/copy_gpu.hpp | 21 ++++++++++--------- cpp/include/raft/core/mdbuffer.hpp | 1 + 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp index f1f4d8b102..fed47d5bd4 100644 --- a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp +++ b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp @@ -43,16 +43,17 @@ std::enable_if_t< void> copy(raft::resources const& handle, T* dst, T const* src, uint32_t size) { - if (src_type == device_type::cpu) { - raft::update_device(dst, src, size, raft::resource::get_cuda_stream(handle)); - } - else if (dst_type == device_type::cpu) { - raft::update_host(dst, src, size, raft::resource::get_cuda_stream(handle)); - cudaDeviceSynchronize(); - } - else { - raft::copy_async(dst, src, size, raft::resource::get_cuda_stream(handle)); - } + // if (src_type == device_type::cpu) { + // raft::update_device(dst, src, size, raft::resource::get_cuda_stream(handle)); + // } + // else if (dst_type == device_type::cpu) { + // raft::update_host(dst, src, size, raft::resource::get_cuda_stream(handle)); + // cudaDeviceSynchronize(); + // } + // else { + // raft::copy_async(dst, src, size, raft::resource::get_cuda_stream(handle)); + // } + raft::copy(dst, src, size, raft::resource::get_cuda_stream(handle)); } } // namespace detail diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp index 78b4f03f9b..6dbbcfec69 100644 --- a/cpp/include/raft/core/mdbuffer.hpp +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -128,6 +128,7 @@ struct mdbuffer { switch (data_.index()) { case 0: result = std::get<0>(data_).get(); break; case 
1: result = std::get<1>(data_).get(); break; + case 2: result = std::get<1>(data_).get(); break; } return result; }()} From 1a1143ffb301422fb542382ae79a8e1b195999e2 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Mon, 3 Jul 2023 14:43:03 -0400 Subject: [PATCH 29/75] Temporarily remove new files to bring back necessary ones --- cpp/CMakeLists.txt | 8 - .../raft/core/buffer_container_policy.hpp | 32 -- .../core/detail/buffer_utils/buffer_copy.hpp | 82 ---- .../core/detail/buffer_utils/copy_cpu.hpp | 47 --- .../core/detail/buffer_utils/copy_gpu.hpp | 60 --- .../detail/buffer_utils/non_owning_buffer.hpp | 56 --- .../detail/buffer_utils/owning_buffer.hpp | 115 ------ .../raft/core/detail/const_agnostic.hpp | 27 -- cpp/include/raft/core/device_support.hpp | 44 --- cpp/include/raft/core/device_type.hpp | 26 -- cpp/include/raft/core/error.hpp | 24 -- cpp/include/raft/core/mdbuffer.hpp | 361 ------------------ cpp/test/CMakeLists.txt | 1 - cpp/test/core/mdbuffer.cpp | 250 ------------ 14 files changed, 1133 deletions(-) delete mode 100644 cpp/include/raft/core/buffer_container_policy.hpp delete mode 100644 cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp delete mode 100644 cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp delete mode 100644 cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp delete mode 100644 cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp delete mode 100644 cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp delete mode 100644 cpp/include/raft/core/detail/const_agnostic.hpp delete mode 100644 cpp/include/raft/core/device_support.hpp delete mode 100644 cpp/include/raft/core/device_type.hpp delete mode 100644 cpp/include/raft/core/mdbuffer.hpp delete mode 100644 cpp/test/core/mdbuffer.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ad5a6cd833..6fa1b5830e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -56,7 +56,6 @@ option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and 
librari option(CUDA_LOG_COMPILE_TIME "Write a log of compilation times to nvcc_compile_log.csv" OFF) option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON) option(DISABLE_DEPRECATION_WARNINGS "Disable deprecaction warnings " ON) -option(DISABLE_CUDA "Disable CUDA in supported RAFT code" OFF) option(DISABLE_OPENMP "Disable OpenMP" OFF) option(RAFT_NVTX "Enable nvtx markers" OFF) @@ -247,13 +246,6 @@ target_compile_definitions(raft::raft INTERFACE $<$:NVTX_ENAB ) endif() -############################################################################## -# - CUDA-free build support -------------------------------------------------- - -if (DISABLE_CUDA) - target_compile_definitions(raft INTERFACE RAFT_DISABLE_GPU) -endif() - # ################################################################################################## # * raft_compiled ------------------------------------------------------------ TODO: Currently, this # package also contains the 'random' namespace (for rmat logic) We couldn't get this to work diff --git a/cpp/include/raft/core/buffer_container_policy.hpp b/cpp/include/raft/core/buffer_container_policy.hpp deleted file mode 100644 index 55712cf55d..0000000000 --- a/cpp/include/raft/core/buffer_container_policy.hpp +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once -#include -#include -#include -#ifndef RAFT_DISABLE_GPU -#include -#endif - -namespace raft { -#ifdef RAFT_DISABLE_GPU -template -using buffer_container_policy = std::variant>; -#else -template -using buffer_container_policy = std::variant, raft::device_uvector_policy>; -#endif -} \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp b/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp deleted file mode 100644 index 3ec58d65a5..0000000000 --- a/cpp/include/raft/core/detail/buffer_utils/buffer_copy.hpp +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once -#include -#include -#include -#ifndef RAFT_DISABLE_GPU -#include -#endif -#include -#include -namespace raft { -namespace detail { -template -void buffer_copy(raft::resources const& handle, - T* dst, - T const* src, - uint32_t size, - uint32_t dst_offset, - uint32_t src_offset) -{ - copy(handle, dst + dst_offset, src + src_offset, size); -} - -template -void buffer_copy(raft::resources const& handle, T* dst, T const* src, uint32_t size) -{ - copy(handle, dst, src, size); -} - -template -void buffer_copy(raft::resources const& handle, - T* dst, - T const* src, - uint32_t size, - device_type dst_type, - device_type src_type, - uint32_t dst_offset, - uint32_t src_offset) -{ - if (dst_type == device_type::gpu && src_type == device_type::gpu) { - copy( - handle, dst + dst_offset, src + src_offset, size); - } else if (dst_type == device_type::cpu && src_type == device_type::cpu) { - copy( - handle, dst + dst_offset, src + src_offset, size); - } else if (dst_type == device_type::gpu && src_type == device_type::cpu) { - raft::print_device_vector("dst_1", dst + dst_offset, size, std::cout); - copy( - handle, dst + dst_offset, src + src_offset, size); - raft::print_device_vector("dst_2", dst + dst_offset, size, std::cout); - } else if (dst_type == device_type::cpu && src_type == device_type::gpu) { - copy( - handle, dst + dst_offset, src + src_offset, size); - } -} - -template -void buffer_copy(raft::resources const& handle, - T* dst, - T const* src, - uint32_t size, - device_type dst_type, - device_type src_type) -{ - buffer_copy(handle, dst, src, size, dst_type, src_type, 0, 0); -} -} // namespace detail -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp deleted file mode 100644 index e2b0280ec8..0000000000 --- a/cpp/include/raft/core/detail/buffer_utils/copy_cpu.hpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA 
CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include -#include -#include -#include -#include - -namespace raft { -namespace detail { - -template -std::enable_if_t, - std::bool_constant>, - void> -copy(raft::resources const& handle, T* dst, T const* src, uint32_t size) -{ - std::copy(src, src + size, dst); -} - -template -std::enable_if_t< - std::conjunction_v, - std::bool_constant>, - std::bool_constant>, - void> -copy(raft::resources const& handle, T* dst, T const* src, uint32_t size) -{ - throw raft::cuda_unsupported("Copying from or to device in non-GPU build"); -} - -} // namespace detail -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp b/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp deleted file mode 100644 index fed47d5bd4..0000000000 --- a/cpp/include/raft/core/detail/buffer_utils/copy_gpu.hpp +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include "raft/core/resource/cuda_stream.hpp" -#include "thrust/detail/raw_pointer_cast.h" -#include "thrust/detail/tuple.inl" -#include "thrust/iterator/zip_iterator.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace raft { -namespace detail { - -template -std::enable_if_t< - std::conjunction_v, - std::bool_constant>, - std::bool_constant>, - void> -copy(raft::resources const& handle, T* dst, T const* src, uint32_t size) -{ - // if (src_type == device_type::cpu) { - // raft::update_device(dst, src, size, raft::resource::get_cuda_stream(handle)); - // } - // else if (dst_type == device_type::cpu) { - // raft::update_host(dst, src, size, raft::resource::get_cuda_stream(handle)); - // cudaDeviceSynchronize(); - // } - // else { - // raft::copy_async(dst, src, size, raft::resource::get_cuda_stream(handle)); - // } - raft::copy(dst, src, size, raft::resource::get_cuda_stream(handle)); -} - -} // namespace detail -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp deleted file mode 100644 index a5c9244a00..0000000000 --- a/cpp/include/raft/core/detail/buffer_utils/non_owning_buffer.hpp +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include "raft/core/buffer_container_policy.hpp" -#include "raft/core/host_container_policy.hpp" -#include "raft/core/host_device_accessor.hpp" -#include -#include -#include - -namespace raft { -namespace detail { -template typename ContainerPolicy = buffer_container_policy> -struct non_owning_buffer { - using container_policy = std::conditional_t, ContainerPolicy>, - std::variant_alternative_t<0, buffer_container_policy>, - ContainerPolicy>; - using accessor_policy = typename container_policy::accessor_policy; - using index_type = typename Extents::index_type; - - non_owning_buffer() : data_{nullptr} {} - - non_owning_buffer(ElementType* ptr, Extents extents) : data_{ptr}, extents_{extents} { - } - - auto* get() const { return data_; } - - auto view() { - using accessor_type = host_device_accessor< - accessor_policy, M>(); - return mdspan{data_, extents_}; - } - private: - ElementType* data_; - Extents extents_; -}; - -} // namespace detail -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp b/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp deleted file mode 100644 index 9c24ca9bab..0000000000 --- a/cpp/include/raft/core/detail/buffer_utils/owning_buffer.hpp +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include -#include -#ifndef RAFT_DISABLE_CUDA -#include -#endif -#include -#include -#include -#include -#include -#include - -namespace raft { -namespace detail { - template typename ContainerPolicy> -struct owning_host_buffer { - using element_type = std::remove_cv_t; - using container_policy = std::conditional_t, ContainerPolicy>, - std::variant_alternative_t<0, buffer_container_policy>, - ContainerPolicy>; - using index_type = typename Extents::index_type; - using buffer = host_mdarray; - owning_host_buffer(raft::resources const& handle, Extents extents) noexcept(false) - : extents_{extents}, data_{[&extents, handle]() { - typename buffer::mapping_type layout{extents}; - typename buffer::container_policy_type policy{}; - buffer host_data{handle, layout, policy}; - return host_data; - }()} - { - } - - auto* get() const { return const_cast(data_.data_handle()); } - - auto view() { - return data_.view(); - } - - private: - Extents extents_; - buffer data_; -}; - -#ifndef RAFT_DISABLE_CUDA -template typename ContainerPolicy> -struct owning_device_buffer { - using element_type = std::remove_cv_t; - using container_policy = std::conditional_t, ContainerPolicy>, - std::variant_alternative_t<1, buffer_container_policy>, - ContainerPolicy>; - using index_type = typename Extents::index_type; - using buffer = device_mdarray; - - owning_device_buffer() : data_{} {} - - owning_device_buffer(raft::resources const& handle, Extents extents) noexcept(false) - : extents_{extents}, data_{[&extents, handle]() { - typename buffer::mapping_type layout{extents}; - typename buffer::container_policy_type policy{}; - buffer device_data{handle, layout, policy}; - return device_data; - }()} - { - } - - auto* get() const {return const_cast(data_.data_handle());} - - auto view() { - data_.view(); - } - private: - Extents extents_; - buffer data_; -}; -#else 
-template typename ContainerPolicy> -struct owning_device_buffer { - owning_device_buffer(raft::resources const& handle, Extents extents) : extents_(extents){} - auto* get() const { return static_cast(nullptr); } - - auto view() { - return host_mdspan(nullptr, exts); - } - - private: - Extents extents_; -}; -#endif -} // namespace detail -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/const_agnostic.hpp b/cpp/include/raft/core/detail/const_agnostic.hpp deleted file mode 100644 index 85e99806b6..0000000000 --- a/cpp/include/raft/core/detail/const_agnostic.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include - -namespace raft::detail { -template -using const_agnostic_same_t = - std::enable_if_t, std::remove_const_t>, V>; - -template -inline constexpr auto const_agnostic_same_v = - std::is_same_v, std::remove_const_t>; -} // namespace raft::detail diff --git a/cpp/include/raft/core/device_support.hpp b/cpp/include/raft/core/device_support.hpp deleted file mode 100644 index c27fd12c5f..0000000000 --- a/cpp/include/raft/core/device_support.hpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include -#include - -namespace raft { -#ifndef RAFT_DISABLE_GPU -auto constexpr static const CUDA_ENABLED = true; -#else -auto constexpr static const CUDA_ENABLED = false; -#endif - -#ifdef __CUDACC__ -#define HOST __host__ -#define DEVICE __device__ -auto constexpr static const GPU_COMPILATION = true; -#else -#define HOST -#define DEVICE -auto constexpr static const GPU_COMPILATION = false; -#endif - -#ifndef DEBUG -auto constexpr static const DEBUG_ENABLED = false; -#elif DEBUG == 0 -auto constexpr static const DEBUG_ENABLED = false; -#else -auto constexpr static const DEBUG_ENABLED = true; -#endif -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/device_type.hpp b/cpp/include/raft/core/device_type.hpp deleted file mode 100644 index a411c8bef7..0000000000 --- a/cpp/include/raft/core/device_type.hpp +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once -#include -namespace raft { -enum class device_type { cpu, gpu }; - -auto constexpr is_compatible(device_type dev_type, memory_type mem_type) -{ - return (dev_type == device_type::gpu && is_device_accessible(mem_type)) || - (dev_type == device_type::cpu && is_host_accessible(mem_type)); -} -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp index 1fe62a8056..84b244f4dc 100644 --- a/cpp/include/raft/core/error.hpp +++ b/cpp/include/raft/core/error.hpp @@ -102,30 +102,6 @@ struct logic_error : public raft::exception { * @} */ -struct bad_cuda_call : logic_error { - bad_cuda_call() : bad_cuda_call("CUDA API call failed") {} - explicit bad_cuda_call(char const* msg) : logic_error(msg) {} -}; - -struct cuda_unsupported : logic_error { - cuda_unsupported() : cuda_unsupported("CUDA functionality invoked in non-CUDA build") {} - explicit cuda_unsupported(char const* msg) : logic_error(msg) {} -}; - -struct out_of_bounds : logic_error { - out_of_bounds() : out_of_bounds("Attempted out-of-bounds memory access") {} - explicit out_of_bounds(char const* msg) : logic_error(msg) {} -}; - -struct wrong_device_type : logic_error { - wrong_device_type() : wrong_device_type("Attempted to use host data on GPU or device data on CPU") {} - explicit wrong_device_type(char const* msg) : logic_error(msg) {} -}; - -struct mem_type_mismatch : logic_error { - mem_type_mismatch() : mem_type_mismatch("Memory type does not match expected type") {} - explicit mem_type_mismatch(char const* msg) : logic_error(msg) {} -}; } // namespace raft // FIXME: Need to be replaced with RAFT_FAIL diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp deleted file mode 100644 index 6dbbcfec69..0000000000 --- a/cpp/include/raft/core/mdbuffer.hpp +++ /dev/null @@ -1,361 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace raft { -/** - * @brief A container which may or may not own its own data on host or device - * - * @tparam ElementType type of the input - * @tparam LayoutPolicy layout of the input - * @tparam ContainerPolicy container to be used to own host/device memory if needed. - * Users must ensure that the container has the correct type (host/device). Exceptions - * due to a device container being used for a host mdbuffer and vice versa are not caught - * by the mdbuffer class. - * @tparam the index type of the extents - */ -template typename ContainerPolicy = buffer_container_policy> -struct mdbuffer { - using data_store = std::variant, - detail::non_owning_buffer, - detail::non_owning_buffer, - detail::owning_host_buffer, - detail::owning_device_buffer>; - - mdbuffer() : device_type_{}, data_{}, length_{0}, memory_type_{memory_type::host} {} - - /** Construct non-initialized owning mdbuffer. For owning buffers, managed memory is treated as - * device memory only. Therefore, users are discouraged from using managed memory for creating - * owning buffers. 
*/ - mdbuffer(raft::resources const& handle, - Extents extents, - memory_type mem_type = memory_type::host) - : device_type_{[mem_type]() { - return is_device_accessible(mem_type) ? device_type::gpu : device_type::cpu; - }()}, - extents_{extents}, - data_{[this, mem_type, handle]() { - auto result = data_store{}; - if (is_device_accessible(mem_type)) { - result = detail::owning_device_buffer{handle, extents_}; - } else { - result = detail::owning_host_buffer{handle, extents_}; - } - return result; - }()}, - length_([this]() { - size_t length = 1; - for (size_t i = 0; i < extents_.rank(); ++i) { - length *= extents_.extent(i); - } - return length; - }()), - memory_type_{mem_type}, - cached_ptr{[this]() { - auto result = static_cast(nullptr); - switch (data_.index()) { - case 3: result = std::get<3>(data_).get(); break; - case 4: result = std::get<4>(data_).get(); break; - } - return result; - }()} - { - } - - /** Construct non-owning mdbuffer. Currently, users must ensure that the input_data is on the same device_type as the requested mem_type. - This cannot be asserted because checking the device id requires CUDA headers (which is against the intended cpu-gpu interop). If - the mem_type is different from the device_type of input_data, the input_data should first be copied to the appropriate location. For - managed memory_type, input_data should be a managed pointer. */ - mdbuffer(raft::resources const& handle, ElementType* input_data, Extents extents, memory_type mem_type = memory_type::host) - : device_type_{[mem_type]() { - return is_device_accessible(mem_type) ? 
device_type::gpu : device_type::cpu; - }()}, - extents_{extents}, - data_{[this, input_data, mem_type]() { - auto result = data_store{}; - if (is_host_device_accessible(mem_type)) { - result = detail::non_owning_buffer{input_data, extents_}; - } else if (is_device_accessible(mem_type)) { - result = detail::non_owning_buffer{input_data, extents_}; - } else { - result = detail::non_owning_buffer{input_data, extents_}; - } - return result; - }()}, - length_([this]() { - std::size_t length = 1; - for (std::size_t i = 0; i < extents_.rank(); ++i) { - length *= extents_.extent(i); - } - return length; - }()), - memory_type_{mem_type}, - cached_ptr{[this]() { - auto result = static_cast(nullptr); - switch (data_.index()) { - case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; - case 2: result = std::get<1>(data_).get(); break; - } - return result; - }()} - { - } - - /** - * @brief Construct one mdbuffer of the given memory type from another. - * A mdbuffer constructed in this way is owning and will copy the data from - * the original location. - */ - mdbuffer(raft::resources const& handle, - mdbuffer const& other, - memory_type mem_type) - : device_type_{[mem_type]() { - return is_device_accessible(mem_type) ? 
device_type::gpu : device_type::cpu; - }()}, - extents_{other.extents()}, - data_{[this, &other, mem_type, handle]() { - auto result = data_store{}; - auto result_data = static_cast(nullptr); - if (is_device_accessible(mem_type)) { - auto buf = - detail::owning_device_buffer(handle, extents_); - result_data = buf.get(); - result = std::move(buf); - detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::gpu, other.dev_type()); - } else { - auto buf = detail::owning_host_buffer(handle, extents_); - result_data = buf.get(); - result = std::move(buf); - detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::cpu, other.dev_type()); - } - return result; - }()}, - length_([this]() { - std::size_t length = 1; - for (std::size_t i = 0; i < extents_.rank(); ++i) { - length *= extents_.extent(i); - } - return length; - }()), - memory_type_{mem_type}, - cached_ptr{[this]() { - auto result = static_cast(nullptr); - switch (data_.index()) { - case 2: result = std::get<2>(data_).get(); break; - case 3: result = std::get<3>(data_).get(); break; - } - return result; - }()} - { - } - - friend void swap(mdbuffer& first, mdbuffer& second) - { - using std::swap; - swap(first.device_type_, second.device_type_); - swap(first.data_, second.data_); - swap(first.size_, second.size_); - swap(first.memory_type_, second.memory_type_); - swap(first.cached_ptr, second.cached_ptr); - } - mdbuffer& operator=(mdbuffer const& other) { - auto copy = other; - swap(*this, copy); - return *this; - } - - /** - * @brief Create owning copy of existing mdbuffer with given stream - * The device type of this new mdbuffer will be the same as the original - */ - mdbuffer(raft::resources const& handle, mdbuffer const& other) : mdbuffer(handle, other, other.mem_type()) - { - } - - /** - * @brief Move from existing mdbuffer unless a copy is necessary based on - * memory location - */ - mdbuffer(raft::resources const& handle, mdbuffer&& other, 
memory_type mem_type) - : device_type_{[mem_type]() { - return is_device_accessible(mem_type) ? device_type::gpu : device_type::cpu; - }()}, - extents_{other.extents()}, - data_{[&other, mem_type, handle, this]() { - auto result = data_store{}; - if (mem_type == other.mem_type()) { - result = std::move(other.data_); - } else { - auto* result_data = static_cast(nullptr); - if (is_device_accessible(mem_type)) { - auto buf = detail::owning_device_buffer{handle, extents_}; - result_data = buf.get(); - result = std::move(buf); - detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::gpu, other.dev_type()); - } else { - auto buf = detail::owning_host_buffer{handle, extents_}; - result_data = buf.get(); - result = std::move(buf); - detail::buffer_copy(handle, result_data, other.data_handle(), other.size(), device_type::cpu, other.dev_type()); - } - } - return result; - }()}, - memory_type_{mem_type}, - cached_ptr{[this]() { - auto result = static_cast(nullptr); - switch (data_.index()) { - case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; - case 2: result = std::get<2>(data_).get(); break; - case 3: result = std::get<3>(data_).get(); break; - case 4: result = std::get<4>(data_).get(); break; - } - return result; - }()} - { - } - - mdbuffer(mdbuffer&& other) noexcept - : device_type_{[&other]() { - return is_device_accessible(other.mem_type()) ? 
device_type::gpu : device_type::cpu; - }()}, - extents_{other.extents_}, - data_{[&other]() { - auto result = data_store{}; - result = std::move(other.data_); - return result; - }()}, - length_{other.length_}, - memory_type_{other.mem_type()}, - cached_ptr{[this]() { - auto result = static_cast(nullptr); - switch (data_.index()) { - case 0: result = std::get<0>(data_).get(); break; - case 1: result = std::get<1>(data_).get(); break; - case 2: result = std::get<2>(data_).get(); break; - case 3: result = std::get<3>(data_).get(); break; - case 4: result = std::get<4>(data_).get(); break; - } - return result; - }()} - { - } - mdbuffer& operator=(mdbuffer&& other) noexcept { - device_type_ = std::move(other.device_type_); - extents_ = std::move(other.extents_); - data_ = std::move(other.data_); - length_ = std::move(other.size()); - memory_type_ = std::move(other.memory_type_); - cached_ptr = std::move(other.cached_ptr); - return *this; - } - auto extents() const noexcept { return extents_; } - HOST DEVICE auto* data_handle() const noexcept { - return cached_ptr; - } - - auto mem_type() const noexcept - { - return memory_type_; - } - - ~mdbuffer() = default; - - HOST DEVICE auto view() const noexcept { - if (data_.index() == 0) - return std::get<0>(data_).view(); - if (data_.index() == 1) - return std::get<1>(data_).view(); - if (data_.index() == 2) - return std::get<2>(data_).view(); - if (data_.index() == 3) - return std::get<3>(data_).view(); - if (data_.index() == 4) - return std::get<4>(data_).view(); - } - - auto size() {return length_;} - private: - auto dev_type() const noexcept - { - return device_type_; - } - - enum device_type device_type_; - Extents extents_; - data_store data_; - size_t length_; - enum memory_type memory_type_; - ElementType* cached_ptr; -}; - -template typename DstContainerPolicy, template typename SrcContainerPolicy> -detail::const_agnostic_same_t copy(raft::resources const& handle, - mdbuffer & dst, - mdbuffer const& src, - size_t 
dst_offset, - size_t src_offset, - size_t size) -{ - if constexpr (bounds_check) { - if (src.size() - src_offset < size || dst.size() - dst_offset < size) { - throw out_of_bounds("Attempted copy to or from mdbuffer of inadequate size"); - } - } - auto src_device_type = is_device_accessible(src.mem_type()) ? device_type::gpu : device_type::cpu; - auto dst_device_type = is_device_accessible(dst.mem_type()) ? device_type::gpu : device_type::cpu; - detail::buffer_copy(handle, - dst.data_handle() + dst_offset, - src.data_handle() + src_offset, - size, - dst_device_type, - src_device_type); -} - -template typename DstContainerPolicy, template typename SrcContainerPolicy> -detail::const_agnostic_same_t copy(raft::resources const& handle, - mdbuffer& dst, - mdbuffer const& src) -{ - copy(handle, dst, src, 0, 0, src.size()); -} -} // namespace raft \ No newline at end of file diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 86b001483a..33d4dd9423 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -99,7 +99,6 @@ if(BUILD_TESTS) NAME CORE_TEST PATH - test/core/mdbuffer.cpp test/core/logger.cpp test/core/math_device.cu test/core/math_host.cpp diff --git a/cpp/test/core/mdbuffer.cpp b/cpp/test/core/mdbuffer.cpp deleted file mode 100644 index 8c6bb02ae3..0000000000 --- a/cpp/test/core/mdbuffer.cpp +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace raft { - -TEST(Buffer, default_buffer) -{ - auto buf = mdbuffer>(); - EXPECT_EQ(buf.mem_type(), memory_type::host); - EXPECT_EQ(buf.size(), 0); - ASSERT_NE(buf.data_handle(), nullptr); -} - -TEST(Buffer, device_buffer) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - auto exts = raft::make_extents(data.size()); - auto test_buffers = std::vector>{}; - test_buffers.emplace_back(handle, exts, memory_type::device); - test_buffers.emplace_back(handle, exts, memory_type::device); - test_buffers.emplace_back(handle, exts, memory_type::device); - - for (auto& buf : test_buffers) { - ASSERT_EQ(buf.mem_type(), memory_type::device); - ASSERT_EQ(buf.size(), data.size()); -#ifndef RAFT_DISABLE_GPU - ASSERT_NE(buf.data_handle(), nullptr); - auto data_out = std::vector(data.size()); - raft::update_device(buf.data_handle(), data.data(), data.size(), raft::resource::get_cuda_stream(handle)); - raft::update_host(data_out.data(), buf.data_handle(), buf.size(), raft::resource::get_cuda_stream(handle)); - EXPECT_THAT(data_out, testing::ElementsAreArray(data)); -#endif - } -} - -TEST(Buffer, non_owning_device_buffer) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - auto exts = raft::make_extents(data.size()); - auto* ptr_d = static_cast(nullptr); -#ifndef RAFT_DISABLE_GPU - cudaMalloc(reinterpret_cast(&ptr_d), sizeof(int) * data.size()); - cudaMemcpy(static_cast(ptr_d), - static_cast(data.data()), - sizeof(int) * data.size(), - cudaMemcpyHostToDevice); -#endif - auto test_buffers = std::vector>{}; - test_buffers.emplace_back(handle, ptr_d, exts, memory_type::device); - test_buffers.emplace_back(handle, ptr_d, exts, memory_type::device); -#ifndef RAFT_DISABLE_GPU - - for (auto& buf : test_buffers) { - ASSERT_EQ(buf.mem_type(), memory_type::device); - ASSERT_EQ(buf.size(), data.size()); - ASSERT_EQ(buf.data_handle(), ptr_d); - - auto data_out 
= std::vector(data.size()); - cudaMemcpy(static_cast(data_out.data()), - static_cast(buf.data_handle()), - sizeof(int) * data.size(), - cudaMemcpyDeviceToHost); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - } - cudaFree(reinterpret_cast(ptr_d)); -#endif -} - -TEST(Buffer, host_buffer) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - auto exts = raft::make_extents(data.size()); - - auto test_buffers = std::vector>{}; - test_buffers.emplace_back(handle, exts, memory_type::host); - test_buffers.emplace_back(handle, exts, memory_type::host); - test_buffers.emplace_back(handle, exts, memory_type::host); - test_buffers.emplace_back(handle, exts); - - for (auto& buf : test_buffers) { - ASSERT_EQ(buf.mem_type(), memory_type::host); - ASSERT_EQ(buf.size(), data.size()); - ASSERT_NE(buf.data_handle(), nullptr); - - std::memcpy( - static_cast(buf.data_handle()), static_cast(data.data()), data.size() * sizeof(int)); - - auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - } -} - -TEST(Buffer, non_owning_host_buffer) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - auto exts = raft::make_extents(data.size()); - std::vector> test_buffers; - test_buffers.emplace_back(handle, data.data(), exts, memory_type::host); - test_buffers.emplace_back(handle, data.data(), exts, memory_type::host); - test_buffers.emplace_back(handle, data.data(), exts); - - for (auto& buf : test_buffers) { - ASSERT_EQ(buf.mem_type(), memory_type::host); - ASSERT_EQ(buf.size(), data.size()); - ASSERT_EQ(buf.data_handle(), data.data()); - - auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - } -} - -TEST(Buffer, move_buffer) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - auto exts = raft::make_extents(data.size()); - auto test_buffers = std::vector>{}; - 
test_buffers.emplace_back(handle, mdbuffer(handle, data.data(), exts, memory_type::host), memory_type::host); - test_buffers.emplace_back(handle, mdbuffer(handle, data.data(), exts, memory_type::host), memory_type::host); - test_buffers.emplace_back(handle, mdbuffer(handle, data.data(), exts, memory_type::host), memory_type::host); - - for (auto& buf : test_buffers) { - ASSERT_EQ(buf.mem_type(), memory_type::host); - ASSERT_EQ(buf.size(), data.size()); - ASSERT_EQ(buf.data_handle(), data.data()); - - auto data_out = std::vector(buf.data_handle(), buf.data_handle() + buf.size()); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - } -#ifndef RAFT_DISABLE_GPU - auto test_dev_buffers = std::vector>{}; - test_buffers.emplace_back(handle, mdbuffer(handle, data.data(), exts, memory_type::host), memory_type::device); - test_buffers.emplace_back(handle, mdbuffer(handle, data.data(), exts, memory_type::host), memory_type::device); - test_buffers.emplace_back(handle, mdbuffer(handle, data.data(), exts, memory_type::host), memory_type::device); - for (auto& buf : test_dev_buffers) { - ASSERT_EQ(buf.mem_type(), memory_type::device); - ASSERT_EQ(buf.size(), data.size()); - ASSERT_NE(buf.data_handle(), data.data()); - - auto data_out = std::vector(buf.size()); - RAFT_CUDA_TRY(cudaMemcpy(static_cast(data_out.data()), static_cast(buf.data_handle()), buf.size() * sizeof(int), cudaMemcpyDefault)); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); - } -#endif -} - -TEST(Buffer, move_assignment_buffer) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - auto exts1 = raft::make_extents(data.size() - 1); - auto exts2 = raft::make_extents(data.size()); - -#ifndef RAFT_DISABLE_GPU - auto buf = mdbuffer{handle, data.data(), exts1, memory_type::device}; -#else - auto buf = mdbuffer{handle, data.data(), exts1, memory_type::host}; -#endif - buf = mdbuffer{handle, exts2, memory_type::host}; - - ASSERT_EQ(buf.mem_type(), memory_type::host); - 
ASSERT_EQ(buf.size(), data.size()); -} - -TEST(Buffer, partial_buffer_copy) -{ - raft::resources handle; - auto data1 = std::vector{1, 2, 3, 4, 5}; - auto data2 = std::vector{0, 0, 0, 0, 0}; - auto expected = std::vector{0, 3, 4, 5, 0}; - auto exts = raft::make_extents(data1.size()); -#ifndef RAFT_DISABLE_GPU - auto buf1 = mdbuffer{handle, mdbuffer{handle, data1.data(), exts, memory_type::host}, memory_type::device}; -#else - auto buf1 = mdbuffer{handle, data1.data(), exts, memory_type::host}; -#endif - auto buf2 = mdbuffer{handle, data2.data(), exts, memory_type::host}; - copy(handle, buf2, buf1, 1, 2, 3); - copy(handle, buf2, buf1, 1, 2, 3); - EXPECT_THROW(copy(handle, buf2, buf1, 1, 2, 4), out_of_bounds); -} - -TEST(Buffer, buffer_copy_overloads) -{ - raft::resources handle; - auto data = std::vector{1, 2, 3}; - auto expected = data; - auto exts = raft::make_extents(data.size()); - auto orig_host_buffer = mdbuffer(handle, data.data(), exts, memory_type::host); - auto orig_dev_buffer = mdbuffer(handle, orig_host_buffer, memory_type::device); - auto copy_dev_buffer = mdbuffer(handle, exts, memory_type::device); - - // copying host to host - auto data_out = std::vector(data.size()); - auto copy_host_buffer = mdbuffer(handle, data_out.data(), exts, memory_type::host); - copy(handle, copy_host_buffer, orig_host_buffer); - EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); - - // copying host to host with offset - data_out = std::vector(data.size() + 1); - copy_host_buffer = mdbuffer(handle, data_out.data(), exts, memory_type::host); - copy(handle, copy_host_buffer, orig_host_buffer, 2, 1, 1); - expected = std::vector{0, 0, 2, 0}; - EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); - -#ifndef RAFT_DISABLE_GPU - // copy device to host - data_out = std::vector(data.size()); - copy_host_buffer = mdbuffer(handle, data_out.data(), exts, memory_type::host); - copy(handle, copy_host_buffer, orig_dev_buffer); - expected = data; - EXPECT_THAT(data_out, 
::testing::ElementsAreArray(expected)); - - // copy device to host with offset - data_out = std::vector(data.size() + 1); - copy_host_buffer = mdbuffer(handle, data_out.data(), exts, memory_type::host); - copy(handle, copy_host_buffer, orig_dev_buffer, 2, 1, 1); - expected = std::vector{0, 0, 2, 0}; - EXPECT_THAT(data_out, ::testing::ElementsAreArray(expected)); -#endif -} -} \ No newline at end of file From acceb618a757729846115bcd25eb695709a85997 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 4 Jul 2023 22:25:45 -0400 Subject: [PATCH 30/75] Begin refactoring buffer container policies --- cpp/include/raft/core/error.hpp | 10 +++ cpp/include/raft/core/mdbuffer.hpp | 118 +++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 cpp/include/raft/core/mdbuffer.hpp diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp index 84b244f4dc..2b0c0fe51c 100644 --- a/cpp/include/raft/core/error.hpp +++ b/cpp/include/raft/core/error.hpp @@ -98,6 +98,16 @@ struct logic_error : public raft::exception { explicit logic_error(std::string const& message) : raft::exception(message) {} }; +/** + * @brief Exception thrown when attempting to use CUDA features from a non-CUDA + * build + * + */ +struct non_cuda_build_error : public raft::exception { + explicit non_cuda_build_error(char const* const message) : raft::exception(message) {} + explicit non_cuda_build_error(std::string const& message) : raft::exception(message) {} +}; + /** * @} */ diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp new file mode 100644 index 0000000000..18c738acb7 --- /dev/null +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace raft { + +namespace detail { +#ifdef RAFT_DISABLE_CUDA +using buffer_stream_view = rmm::cuda_stream_view; +#else +struct buffer_stream_view { + auto value() const { + throw non_cuda_build_error{ + "Attempted to access CUDA stream in non-CUDA build" + }; + } + [[nodiscard]] auto is_per_thread_default() const { + throw non_cuda_build_error{ + "Attempted to access CUDA stream in non-CUDA build" + }; + return false; + } + [[nodiscard]] auto is_default() const { + throw non_cuda_build_error{ + "Attempted to access CUDA stream in non-CUDA build" + }; + return false; + } + void synchronize() const { + throw non_cuda_build_error{ + "Attempted to sync CUDA stream in non-CUDA build" + }; + } + + void synchronize_no_throw() const { + RAFT_LOG_ERROR( + "Attempted to sync CUDA stream in non-CUDA build" + ); + } +}; +#endif +} + +template +struct fail_container { + using pointer = T*; + using const_pointer = T const*; + + using reference = T&; + using const_reference = T const&; + + using iterator = pointer; + using const_iterator = const_pointer; + + explicit fail_container(size_t n=size_t{}) { + if (n != size_t{}) { + throw non_cuda_build_error{ + "Attempted to allocate device container in non-CUDA build" + }; + } + } +}; + +template +struct fail_container_policy { + using element_type = ElementType; + using container_type = fail_container; + using pointer = typename container_type::pointer; + using const_pointer = typename container_type::const_pointer; +}; + +namespace detail { +template +using 
default_buffer_host_policy = host_vector_policy; + +#ifdef RAFT_DISABLE_CUDA +#else +template +using default_buffer_device_policy = device_uvector_policy; +#endif +} + +template < + typename ElementType +> +struct default_buffer_container_policy { + using element_type = ElementType; + using container_policy_variant = std::variant< + device_uvector_policy, + host_vector_policy + >; +}; + +template < + typename ElementType, + typename Extents, + typename LayoutPolicy = layout_c_contiguous, + typename ContainerPolicy +struct mdbuffer { +}; + +} From fdefc34aa4d9369820b6edb53280c2dcab9d9065 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Mon, 10 Jul 2023 14:31:40 -0400 Subject: [PATCH 31/75] Add placeholder resource for stream view in CUDA-free builds --- .../core/detail/fail_container_policy.hpp | 159 ++++++++++++++++++ .../raft/core/device_container_policy.hpp | 65 +++++++ .../raft/core/host_container_policy.hpp | 48 ++++++ cpp/include/raft/core/mdbuffer.hpp | 102 ++++++----- cpp/include/raft/core/memory_type.hpp | 8 +- .../raft/core/resource/resource_types.hpp | 2 + cpp/include/raft/core/stream_view.hpp | 104 ++++++++++++ 7 files changed, 445 insertions(+), 43 deletions(-) create mode 100644 cpp/include/raft/core/detail/fail_container_policy.hpp create mode 100644 cpp/include/raft/core/stream_view.hpp diff --git a/cpp/include/raft/core/detail/fail_container_policy.hpp b/cpp/include/raft/core/detail/fail_container_policy.hpp new file mode 100644 index 0000000000..e468539a0d --- /dev/null +++ b/cpp/include/raft/core/detail/fail_container_policy.hpp @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include + +namespace raft { +namespace detail { + +template +struct fail_reference { + using value_type = typename std::remove_cv_t; + using pointer = T*; + using const_pointer = T const*; + + fail_reference() = default; + template + fail_reference(T* ptr, StreamViewType stream) { + throw non_cuda_build_error{ + "Attempted to construct reference to device data in non-CUDA build" + }; + } + + operator value_type() const // NOLINT + { + throw non_cuda_build_error{ + "Attempted to dereference device data in non-CUDA build" + }; + return value_type{}; + } + auto operator=(T const& other) -> fail_reference& + { + throw non_cuda_build_error{ + "Attempted to assign to device data in non-CUDA build" + }; + return *this; + } +}; + +/** A placeholder container which throws an exception on use + * + * This placeholder is used in non-CUDA builds for container types that would + * otherwise be provided with CUDA code. Attempting to construct a non-empty + * container of this type throws an exception indicating that there was an + * attempt to use the device from a non-CUDA build. An example of when this + * might happen is if a downstream application attempts to allocate a device + * mdarray using a library built with non-CUDA RAFT. 
+ */ +template +struct fail_container { + using value_type = T; + using size_type = std::size_t; + + using reference = fail_reference; + using const_reference = fail_reference; + + using pointer = value_type*; + using const_pointer = value_type const*; + + using iterator = pointer; + using const_iterator = const_pointer; + + explicit fail_container(size_t n=size_t{}) { + if (n != size_t{}) { + throw non_cuda_build_error{ + "Attempted to allocate device container in non-CUDA build" + }; + } + } + + template + auto operator[](Index i) noexcept -> reference { + RAFT_LOG_ERROR( + "Attempted to access device data in non-CUDA build" + ); + return reference{}; + } + + template + auto operator[](Index i) const noexcept -> const_reference { + RAFT_LOG_ERROR( + "Attempted to access device data in non-CUDA build" + ); + return const_reference{}; + } + void resize(size_t n) { + if (n != size_t{}) { + throw non_cuda_build_error{ + "Attempted to allocate device container in non-CUDA build" + }; + } + } + + [[nodiscard]] auto data() noexcept -> pointer { return nullptr; } + [[nodiscard]] auto data() const noexcept -> const_pointer { return nullptr; } +}; + + +/** A placeholder container policy which throws an exception on use + * + * This placeholder is used in non-CUDA builds for container types that would + * otherwise be provided with CUDA code. Attempting to construct a non-empty + * container of this type throws an exception indicating that there was an + * attempt to use the device from a non-CUDA build. An example of when this + * might happen is if a downstream application attempts to allocate a device + * mdarray using a library built with non-CUDA RAFT. 
+ */ +template +struct fail_container_policy { + using element_type = ElementType; + using container_type = fail_container; + using pointer = typename container_type::pointer; + using const_pointer = typename container_type::const_pointer; + using reference = typename container_type::reference; + using const_reference = typename container_type::const_reference; + + using accessor_policy = std::experimental::default_accessor; + using const_accessor_policy = std::experimental::default_accessor; + + auto create(raft::resources const& res, size_t n) -> container_type + { + return container_type(n); + } + + fail_container_policy() = default; + + [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference + { + return c[n]; + } + [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept + -> const_reference + { + return c[n]; + } + + [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } + [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } +}; + +} // namespace detail +} // namespace raft diff --git a/cpp/include/raft/core/device_container_policy.hpp b/cpp/include/raft/core/device_container_policy.hpp index eef981e56f..c72f2d2bb2 100644 --- a/cpp/include/raft/core/device_container_policy.hpp +++ b/cpp/include/raft/core/device_container_policy.hpp @@ -21,6 +21,7 @@ * limitations under the License. */ #pragma once +#ifndef RAFT_DISABLE_CUDA #include #include @@ -32,6 +33,7 @@ #include #include #include +#include #include @@ -183,4 +185,67 @@ class device_uvector_policy { [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } }; +/** + * @brief A container policy for managed mdarray. 
+ */ +template +class managed_uvector_policy { + public: + using element_type = ElementType; + using container_type = device_uvector; + // FIXME(jiamingy): allocator type is not supported by rmm::device_uvector + using pointer = typename container_type::pointer; + using const_pointer = typename container_type::const_pointer; + using reference = device_reference; + using const_reference = device_reference; + + using accessor_policy = std::experimental::default_accessor; + using const_accessor_policy = std::experimental::default_accessor; + + public: + auto create(raft::resources const& res, size_t n) -> container_type + { + return container_type(n, resource::get_cuda_stream(res), &mr_); + } + + managed_uvector_policy() = default; + + [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference + { + return c[n]; + } + [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept + -> const_reference + { + return c[n]; + } + + [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } + [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } + private: + rmm::mr::managed_memory_resource mr_{}; +}; + +} // namespace raft +#else +#include +namespace raft { + +// Provide placeholders that will allow CPU-GPU interoperable codebases to +// compile in non-CUDA mode but which will throw exceptions at runtime on any +// attempt to touch device data + +template +using device_reference = detail::fail_reference; + +template +using device_uvector = detail::fail_container; + +template +using device_uvector_policy = detail::fail_container_policy; + +template +using managed_uvector_policy = detail::fail_container_policy; + } // namespace raft +#endif diff --git a/cpp/include/raft/core/host_container_policy.hpp b/cpp/include/raft/core/host_container_policy.hpp index 3b3538ea20..bbf050fab6 100644 --- a/cpp/include/raft/core/host_container_policy.hpp +++ 
b/cpp/include/raft/core/host_container_policy.hpp @@ -24,6 +24,13 @@ #include #include #include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#include +#else +#include +#endif namespace raft { @@ -62,4 +69,45 @@ class host_vector_policy { [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } }; + +#ifndef RAFT_DISABLE_CUDA +/** + * @brief A container policy for pinned mdarray. + */ +template +struct pinned_vector_policy { + using element_type = ElementType; + using allocator_type = thrust::mr::stateless_resource_allocator; + using container_type = thrust::host_vector; + using pointer = typename container_type::pointer; + using const_pointer = typename container_type::const_pointer; + using reference = element_type&; + using const_reference = element_type const&; + using accessor_policy = std::experimental::default_accessor; + using const_accessor_policy = std::experimental::default_accessor; + + auto create(raft::resources const&, size_t n) -> container_type { return container_type(n, allocator_); } + + constexpr pinned_vector_policy() noexcept(std::is_nothrow_default_constructible_v) : mr_{}, allocator_{&mr_} {} + + [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference + { + return c[n]; + } + [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept + -> const_reference + { + return c[n]; + } + + [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } + [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } + private: + thrust::system::cuda::universal_host_pinned_memory_resource mr_; + allocator_type allocator_; +}; +#else +template +using pinned_vector_policy = detail::fail_container_policy; +#endif } // namespace raft diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp index 
18c738acb7..477b2cdc7e 100644 --- a/cpp/include/raft/core/mdbuffer.hpp +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -15,7 +15,9 @@ */ #include +#include #include +#include #include namespace raft { @@ -55,64 +57,80 @@ struct buffer_stream_view { } }; #endif -} - -template -struct fail_container { - using pointer = T*; - using const_pointer = T const*; - - using reference = T&; - using const_reference = T const&; - - using iterator = pointer; - using const_iterator = const_pointer; +} // namespace detail - explicit fail_container(size_t n=size_t{}) { - if (n != size_t{}) { - throw non_cuda_build_error{ - "Attempted to allocate device container in non-CUDA build" - }; - } - } -}; - -template -struct fail_container_policy { - using element_type = ElementType; - using container_type = fail_container; - using pointer = typename container_type::pointer; - using const_pointer = typename container_type::const_pointer; -}; - -namespace detail { -template -using default_buffer_host_policy = host_vector_policy; - -#ifdef RAFT_DISABLE_CUDA -#else -template -using default_buffer_device_policy = device_uvector_policy; -#endif +inline auto constexpr variant_index_from_memory_type(raft::memory_type mem_type) { + return static_cast>(mem_type); } +template +using alternate_from_mem_type = std::variant_alternative_t; + template < typename ElementType > struct default_buffer_container_policy { using element_type = ElementType; + using value_type = std::remove_cv_t; using container_policy_variant = std::variant< + host_vector_policy, device_uvector_policy, - host_vector_policy + managed_uvector_policy, + pinned_vector_policy >; + + template + using underlying_policy = alternate_from_mem_type; +}; + +template +struct universal_buffer_reference { + using value_type = typename ContainerPolicy::value_type; + using pointer = typename ContainerPolicy::value_type*; + using const_pointer = typename ContainerPolicy::value_type const*; + + using reference_variant = std::variant< + typename 
ContainerPolicy::template underlying_policy::reference, + typename ContainerPolicy::template underlying_policy::reference, + typename ContainerPolicy::template underlying_policy::reference, + typename ContainerPolicy::template underlying_policy::reference + >; + using const_reference_variant = std::variant< + typename ContainerPolicy::template underlying_policy::const_reference, + typename ContainerPolicy::template underlying_policy::const_reference, + typename ContainerPolicy::template underlying_policy::const_reference, + typename ContainerPolicy::template underlying_policy::const_reference + >; + + universal_buffer_reference(pointer ptr, raft::memory_type mem_type) + : ptr_{ptr}, mem_type_{mem_type} + { + } + private: + pointer ptr_; + raft::memory_type mem_type_; + }; template < typename ElementType, typename Extents, typename LayoutPolicy = layout_c_contiguous, - typename ContainerPolicy -struct mdbuffer { + typename ContainerPolicy = default_buffer_container_policy +> struct mdbuffer { + using extents_type = Extents; + using layout_type = LayoutPolicy; + using mapping_type = typename layout_type::template mapping; + using element_type = ElementType; + + using value_type = std::remove_cv_t; + using index_type = typename extents_type::index_type; + using difference_type = std::ptrdiff_t; + using rank_type = typename extents_type::rank_type; + + using owning_container_variant = std::variant< + mdarray; }; -} +} // namespace raft diff --git a/cpp/include/raft/core/memory_type.hpp b/cpp/include/raft/core/memory_type.hpp index cd37a0ee50..4f40161a25 100644 --- a/cpp/include/raft/core/memory_type.hpp +++ b/cpp/include/raft/core/memory_type.hpp @@ -14,9 +14,15 @@ * limitations under the License. 
*/ #pragma once +#include namespace raft { -enum class memory_type { host, device, managed, pinned }; +enum class memory_type : std::uint8_t { + host = std::uint8_t{0}, + device = std::uint8_t{1}, + managed = std::uint8_t{2}, + pinned = std::uint8_t{3} +}; auto constexpr is_device_accessible(memory_type mem_type) { diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index 2dc4eb1f9d..d3c09437b2 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -39,6 +39,8 @@ enum resource_type { SUB_COMMUNICATOR, // raft sub communicator DEVICE_PROPERTIES, // cuda device properties DEVICE_ID, // cuda device id + STREAM_VIEW, // view of a cuda stream or a placeholder in + // CUDA-free builds THRUST_POLICY, // thrust execution policy WORKSPACE_RESOURCE, // rmm device memory resource diff --git a/cpp/include/raft/core/stream_view.hpp b/cpp/include/raft/core/stream_view.hpp new file mode 100644 index 0000000000..1fe3498359 --- /dev/null +++ b/cpp/include/raft/core/stream_view.hpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#endif + +namespace raft { + +namespace detail { +struct fail_stream_view { + constexpr fail_stream_view(fail_stream_view const&) = default; + constexpr fail_stream_view(fail_stream_view&&) = default; + auto constexpr operator=(fail_stream_view const&) -> fail_stream_view& = default; + auto constexpr operator=(fail_stream_view&&) -> fail_stream_view& = default; + auto value() { + throw non_cuda_build_error{ + "Attempted to access CUDA stream in non-CUDA build" + }; + } + [[nodiscard]] auto is_per_thread_default() const { + return false; + } + [[nodiscard]] auto is_default() const { + return false; + } + void synchronize() const { + throw non_cuda_build_error{ + "Attempted to sync CUDA stream in non-CUDA build" + }; + } + void synchronize_no_throw() const { + RAFT_LOG_ERROR( + "Attempted to sync CUDA stream in non-CUDA build" + ); + } +}; +} // namespace detail + +/** A lightweight wrapper around rmm::cuda_stream_view that can be used in + * CUDA-free builds + * + * While CUDA-free builds should never actually make use of a CUDA stream at + * runtime, it is sometimes useful to have a symbol that can stand in place of + * a CUDA stream to avoid excessive ifdef directives interspersed with other + * logic. 
This struct's methods invoke the underlying rmm::cuda_stream_view in + * CUDA-enabled builds but throw runtime exceptions if any non-trivial method + * is called from a CUDA-free build */ +struct stream_view { +#ifndef RAFT_DISABLE_CUDA + using underlying_view_type = rmm::cuda_stream_view; +#else + using underlying_view_type = detail::fail_stream_view; +#endif + constexpr stream_view(stream_view const&) = default; + constexpr stream_view(stream_view&&) = default; + auto operator=(stream_view const&) -> stream_view& = default; + auto operator=(stream_view&&) -> stream_view& = default; + auto value() { + return base_view_.value(); + } + operator underlying_view_type() const noexcept { + return base_view_; + } + [[nodiscard]] auto is_per_thread_default() const { + return base_view_.is_per_thread_default(); + } + [[nodiscard]] auto is_default() const { + return base_view_.is_default(); + } + void synchronize() const { + base_view_.synchronize(); + } + void synchronize_no_throw() const { + base_view_.synchronize_no_throw(); + } + + auto underlying() { + return base_view_; + } + void synchronize_if_cuda() { +#ifndef RAFT_DISABLE_CUDA + base_view_.synchronize(); +#endif + } + private: + underlying_view_type base_view_; +}; + +} // namespace raft From 24223ed8721a118fc2b106c00e604bc90fdfb9d5 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 11 Jul 2023 11:46:32 -0400 Subject: [PATCH 32/75] Add infrastructure for CUDA-free build --- cpp/CMakeLists.txt | 471 +++++++------ cpp/include/raft/core/cuda_support.hpp | 23 + cpp/include/raft/core/error.hpp | 2 +- cpp/include/raft/core/memory_type.hpp | 2 +- .../raft/core/resource/stream_view.hpp | 104 +++ cpp/include/raft/core/stream_view.hpp | 103 +-- cpp/internal/CMakeLists.txt | 6 +- cpp/test/CMakeLists.txt | 662 +++++++++--------- cpp/test/core/stream_view.cpp | 43 ++ 9 files changed, 840 insertions(+), 576 deletions(-) create mode 100644 cpp/include/raft/core/cuda_support.hpp create mode 100644 
cpp/include/raft/core/resource/stream_view.hpp create mode 100644 cpp/test/core/stream_view.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6fa1b5830e..0c1e9c1eec 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -13,21 +13,30 @@ set(RAPIDS_VERSION "23.08") set(RAFT_VERSION "23.08.00") +option(DISABLE_CUDA "Disable CUDA" OFF) + cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(../fetch_rapids.cmake) include(rapids-cmake) include(rapids-cpm) -include(rapids-cuda) include(rapids-export) include(rapids-find) +if(NOT DISABLE_CUDA) + include(rapids-cuda) + rapids_cuda_init_architectures(RAFT) + project( + RAFT + VERSION ${RAFT_VERSION} + LANGUAGES CXX CUDA + ) +else() + project( + RAFT + VERSION ${RAFT_VERSION} + LANGUAGES CXX + ) +endif() -rapids_cuda_init_architectures(RAFT) - -project( - RAFT - VERSION ${RAFT_VERSION} - LANGUAGES CXX CUDA -) # Write the version header rapids_cmake_write_version_file(include/raft/version_config.hpp) @@ -85,6 +94,7 @@ message(VERBOSE "RAFT: Building raft C++ benchmarks: ${BUILD_PRIMS_BENCH}") message(VERBOSE "RAFT: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}") message(VERBOSE "RAFT: Disable depreaction warnings " ${DISABLE_DEPRECATION_WARNINGS}) message(VERBOSE "RAFT: Disable OpenMP: ${DISABLE_OPENMP}") +message(VERBOSE "RAFT: Disable CUDA: ${DISABLE_CUDA}") message(VERBOSE "RAFT: Enable kernel resource usage info: ${CUDA_ENABLE_KERNELINFO}") message(VERBOSE "RAFT: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}") message(VERBOSE "RAFT: Enable nvtx markers: ${RAFT_NVTX}") @@ -123,8 +133,10 @@ if(CUDA_STATIC_RUNTIME) set(_ctk_static_suffix "_static") endif() -# CUDA runtime -rapids_cuda_init_runtime(USE_STATIC ${CUDA_STATIC_RUNTIME}) +if(NOT DISABLE_CUDA) + # CUDA runtime + rapids_cuda_init_runtime(USE_STATIC ${CUDA_STATIC_RUNTIME}) +endif() if(NOT DISABLE_OPENMP) find_package(OpenMP) @@ -133,16 +145,18 @@ if(NOT DISABLE_OPENMP) endif() endif() -# * find CUDAToolkit 
package -# * determine GPU architectures -# * enable the CMake CUDA language -# * set other CUDA compilation flags -rapids_find_package( - CUDAToolkit REQUIRED - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports -) -include(cmake/modules/ConfigureCUDA.cmake) +if(NOT DISABLE_CUDA) + # * find CUDAToolkit package + # * determine GPU architectures + # * enable the CMake CUDA language + # * set other CUDA compilation flags + rapids_find_package( + CUDAToolkit REQUIRED + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports + ) + include(cmake/modules/ConfigureCUDA.cmake) +endif() # ################################################################################################## # * Requirements ------------------------------------------------------------- @@ -150,13 +164,15 @@ include(cmake/modules/ConfigureCUDA.cmake) # add third party dependencies using CPM rapids_cpm_init() -# thrust before rmm/cuco so we get the right version of thrust/cub -include(cmake/thirdparty/get_thrust.cmake) -include(cmake/thirdparty/get_rmm.cmake) -include(cmake/thirdparty/get_cutlass.cmake) +if(NOT DISABLE_CUDA) + # thrust before rmm/cuco so we get the right version of thrust/cub + include(cmake/thirdparty/get_thrust.cmake) + include(cmake/thirdparty/get_rmm.cmake) + include(cmake/thirdparty/get_cutlass.cmake) -include(${rapids-cmake-dir}/cpm/cuco.cmake) -rapids_cpm_cuco(BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports) + include(${rapids-cmake-dir}/cpm/cuco.cmake) + rapids_cpm_cuco(BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports) +endif() if(BUILD_TESTS) include(cmake/thirdparty/get_gtest.cmake) @@ -176,23 +192,28 @@ target_include_directories( raft INTERFACE "$" "$" ) -# Keep RAFT as lightweight as possible. Only CUDA libs and rmm should be used in global target. -target_link_libraries(raft INTERFACE rmm::rmm cuco::cuco nvidia::cutlass::cutlass raft::Thrust) +if(NOT DISABLE_CUDA) + # Keep RAFT as lightweight as possible. 
Only CUDA libs and rmm should be used in global target. + target_link_libraries(raft INTERFACE rmm::rmm cuco::cuco nvidia::cutlass::cutlass raft::Thrust) +endif() target_compile_features(raft INTERFACE cxx_std_17 $) -target_compile_options( - raft INTERFACE $<$:--expt-extended-lambda - --expt-relaxed-constexpr> -) +set(RAFT_CTK_MATH_DEPENDENCIES "") +if(NOT DISABLE_CUDA) + target_compile_options( + raft INTERFACE $<$:--expt-extended-lambda + --expt-relaxed-constexpr> + ) -set(RAFT_CUSOLVER_DEPENDENCY CUDA::cusolver${_ctk_static_suffix}) -set(RAFT_CUBLAS_DEPENDENCY CUDA::cublas${_ctk_static_suffix}) -set(RAFT_CURAND_DEPENDENCY CUDA::curand${_ctk_static_suffix}) -set(RAFT_CUSPARSE_DEPENDENCY CUDA::cusparse${_ctk_static_suffix}) + set(RAFT_CUSOLVER_DEPENDENCY CUDA::cusolver${_ctk_static_suffix}) + set(RAFT_CUBLAS_DEPENDENCY CUDA::cublas${_ctk_static_suffix}) + set(RAFT_CURAND_DEPENDENCY CUDA::curand${_ctk_static_suffix}) + set(RAFT_CUSPARSE_DEPENDENCY CUDA::cusparse${_ctk_static_suffix}) -set(RAFT_CTK_MATH_DEPENDENCIES ${RAFT_CUBLAS_DEPENDENCY} ${RAFT_CUSOLVER_DEPENDENCY} - ${RAFT_CUSPARSE_DEPENDENCY} ${RAFT_CURAND_DEPENDENCY} -) + set(RAFT_CTK_MATH_DEPENDENCIES ${RAFT_CUBLAS_DEPENDENCY} ${RAFT_CUSOLVER_DEPENDENCY} + ${RAFT_CUSPARSE_DEPENDENCY} ${RAFT_CURAND_DEPENDENCY} + ) +endif() # Endian detection include(TestBigEndian) @@ -261,156 +282,173 @@ endif() set_target_properties(raft_compiled PROPERTIES EXPORT_NAME compiled) if(RAFT_COMPILE_LIBRARY) - add_library( - raft_lib - src/core/logger.cpp - src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu - 
src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_rbf.cu - src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu - src/distance/distance.cu - src/distance/fused_l2_nn.cu - src/linalg/detail/coalesced_reduction.cu - 
src/matrix/detail/select_k_double_int64_t.cu - src/matrix/detail/select_k_double_uint32_t.cu - src/matrix/detail/select_k_float_int64_t.cu - src/matrix/detail/select_k_float_uint32_t.cu - src/matrix/detail/select_k_float_int32.cu - src/matrix/detail/select_k_half_int64_t.cu - src/matrix/detail/select_k_half_uint32_t.cu - src/neighbors/ball_cover.cu - src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu - src/neighbors/brute_force_knn_int64_t_float_int64_t.cu - src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu - src/neighbors/brute_force_knn_int_float_int.cu - src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu - src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu - src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu - src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu - src/neighbors/detail/ivf_flat_search.cu - src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu - src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu - src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu - src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu - src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu - src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu - src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu - src/neighbors/detail/selection_faiss_int32_t_float.cu - src/neighbors/detail/selection_faiss_int_double.cu - src/neighbors/detail/selection_faiss_long_float.cu - src/neighbors/detail/selection_faiss_size_t_double.cu - src/neighbors/detail/selection_faiss_size_t_float.cu - src/neighbors/detail/selection_faiss_uint32_t_float.cu - src/neighbors/detail/selection_faiss_int64_t_double.cu - src/neighbors/detail/selection_faiss_int64_t_half.cu - src/neighbors/detail/selection_faiss_uint32_t_double.cu - src/neighbors/detail/selection_faiss_uint32_t_half.cu - src/neighbors/ivf_flat_build_float_int64_t.cu - 
src/neighbors/ivf_flat_build_int8_t_int64_t.cu - src/neighbors/ivf_flat_build_uint8_t_int64_t.cu - src/neighbors/ivf_flat_extend_float_int64_t.cu - src/neighbors/ivf_flat_extend_int8_t_int64_t.cu - src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu - src/neighbors/ivf_flat_search_float_int64_t.cu - src/neighbors/ivf_flat_search_int8_t_int64_t.cu - src/neighbors/ivf_flat_search_uint8_t_int64_t.cu - src/neighbors/ivfpq_build_float_int64_t.cu - src/neighbors/ivfpq_build_int8_t_int64_t.cu - src/neighbors/ivfpq_build_uint8_t_int64_t.cu - src/neighbors/ivfpq_extend_float_int64_t.cu - src/neighbors/ivfpq_extend_int8_t_int64_t.cu - src/neighbors/ivfpq_extend_uint8_t_int64_t.cu - src/neighbors/ivfpq_search_float_int64_t.cu - src/neighbors/ivfpq_search_int8_t_int64_t.cu - src/neighbors/ivfpq_search_uint8_t_int64_t.cu - src/neighbors/refine_float_float.cu - src/neighbors/refine_int8_t_float.cu - src/neighbors/refine_uint8_t_float.cu - src/raft_runtime/cluster/cluster_cost.cuh - src/raft_runtime/cluster/cluster_cost_double.cu - src/raft_runtime/cluster/cluster_cost_float.cu - src/raft_runtime/cluster/kmeans_fit_double.cu - src/raft_runtime/cluster/kmeans_fit_float.cu - src/raft_runtime/cluster/kmeans_init_plus_plus_double.cu - src/raft_runtime/cluster/kmeans_init_plus_plus_float.cu - src/raft_runtime/cluster/update_centroids.cuh - src/raft_runtime/cluster/update_centroids_double.cu - src/raft_runtime/cluster/update_centroids_float.cu - src/raft_runtime/distance/fused_l2_min_arg.cu - src/raft_runtime/distance/pairwise_distance.cu - src/raft_runtime/matrix/select_k_float_int64_t.cu - src/raft_runtime/neighbors/brute_force_knn_int64_t_float.cu - src/raft_runtime/neighbors/ivf_flat_build.cu - src/raft_runtime/neighbors/ivf_flat_search.cu - src/raft_runtime/neighbors/ivf_flat_serialize.cu - src/raft_runtime/neighbors/ivfpq_build.cu - src/raft_runtime/neighbors/ivfpq_deserialize.cu - src/raft_runtime/neighbors/ivfpq_search_float_int64_t.cu - 
src/raft_runtime/neighbors/ivfpq_search_int8_t_int64_t.cu - src/raft_runtime/neighbors/ivfpq_search_uint8_t_int64_t.cu - src/raft_runtime/neighbors/ivfpq_serialize.cu - src/raft_runtime/neighbors/refine_d_int64_t_float.cu - src/raft_runtime/neighbors/refine_d_int64_t_int8_t.cu - src/raft_runtime/neighbors/refine_d_int64_t_uint8_t.cu - src/raft_runtime/neighbors/refine_h_int64_t_float.cu - src/raft_runtime/neighbors/refine_h_int64_t_int8_t.cu - src/raft_runtime/neighbors/refine_h_int64_t_uint8_t.cu - src/raft_runtime/random/rmat_rectangular_generator_int64_double.cu - src/raft_runtime/random/rmat_rectangular_generator_int64_float.cu - src/raft_runtime/random/rmat_rectangular_generator_int_double.cu - src/raft_runtime/random/rmat_rectangular_generator_int_float.cu - src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu - src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu - src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu - src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu - src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu - src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu - src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu - src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu - src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu - src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu - src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu - src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu - src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu - src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu - src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu - src/util/memory_pool.cpp - ) - set_target_properties( - raft_lib - PROPERTIES OUTPUT_NAME raft - BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - 
CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON - ) + if(DISABLE_CUDA) + add_library( + raft_lib + src/core/logger.cpp + ) + set_target_properties( + raft_lib + PROPERTIES OUTPUT_NAME raft + BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + else() + add_library( + raft_lib + src/core/logger.cpp + src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu + 
src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_rbf.cu + src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu + src/distance/distance.cu + src/distance/fused_l2_nn.cu + src/linalg/detail/coalesced_reduction.cu + src/matrix/detail/select_k_double_int64_t.cu + src/matrix/detail/select_k_double_uint32_t.cu + src/matrix/detail/select_k_float_int64_t.cu + src/matrix/detail/select_k_float_uint32_t.cu + src/matrix/detail/select_k_float_int32.cu + src/matrix/detail/select_k_half_int64_t.cu + src/matrix/detail/select_k_half_uint32_t.cu + src/neighbors/ball_cover.cu + src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu + src/neighbors/brute_force_knn_int64_t_float_int64_t.cu + src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu + src/neighbors/brute_force_knn_int_float_int.cu + src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu + src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu + src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu + src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu + src/neighbors/detail/ivf_flat_search.cu + src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu + src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu + 
src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu + src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu + src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu + src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu + src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu + src/neighbors/detail/selection_faiss_int32_t_float.cu + src/neighbors/detail/selection_faiss_int_double.cu + src/neighbors/detail/selection_faiss_long_float.cu + src/neighbors/detail/selection_faiss_size_t_double.cu + src/neighbors/detail/selection_faiss_size_t_float.cu + src/neighbors/detail/selection_faiss_uint32_t_float.cu + src/neighbors/detail/selection_faiss_int64_t_double.cu + src/neighbors/detail/selection_faiss_int64_t_half.cu + src/neighbors/detail/selection_faiss_uint32_t_double.cu + src/neighbors/detail/selection_faiss_uint32_t_half.cu + src/neighbors/ivf_flat_build_float_int64_t.cu + src/neighbors/ivf_flat_build_int8_t_int64_t.cu + src/neighbors/ivf_flat_build_uint8_t_int64_t.cu + src/neighbors/ivf_flat_extend_float_int64_t.cu + src/neighbors/ivf_flat_extend_int8_t_int64_t.cu + src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu + src/neighbors/ivf_flat_search_float_int64_t.cu + src/neighbors/ivf_flat_search_int8_t_int64_t.cu + src/neighbors/ivf_flat_search_uint8_t_int64_t.cu + src/neighbors/ivfpq_build_float_int64_t.cu + src/neighbors/ivfpq_build_int8_t_int64_t.cu + src/neighbors/ivfpq_build_uint8_t_int64_t.cu + src/neighbors/ivfpq_extend_float_int64_t.cu + src/neighbors/ivfpq_extend_int8_t_int64_t.cu + src/neighbors/ivfpq_extend_uint8_t_int64_t.cu + src/neighbors/ivfpq_search_float_int64_t.cu + src/neighbors/ivfpq_search_int8_t_int64_t.cu + src/neighbors/ivfpq_search_uint8_t_int64_t.cu + src/neighbors/refine_float_float.cu + src/neighbors/refine_int8_t_float.cu + src/neighbors/refine_uint8_t_float.cu + src/raft_runtime/cluster/cluster_cost.cuh + src/raft_runtime/cluster/cluster_cost_double.cu + 
src/raft_runtime/cluster/cluster_cost_float.cu + src/raft_runtime/cluster/kmeans_fit_double.cu + src/raft_runtime/cluster/kmeans_fit_float.cu + src/raft_runtime/cluster/kmeans_init_plus_plus_double.cu + src/raft_runtime/cluster/kmeans_init_plus_plus_float.cu + src/raft_runtime/cluster/update_centroids.cuh + src/raft_runtime/cluster/update_centroids_double.cu + src/raft_runtime/cluster/update_centroids_float.cu + src/raft_runtime/distance/fused_l2_min_arg.cu + src/raft_runtime/distance/pairwise_distance.cu + src/raft_runtime/matrix/select_k_float_int64_t.cu + src/raft_runtime/neighbors/brute_force_knn_int64_t_float.cu + src/raft_runtime/neighbors/ivf_flat_build.cu + src/raft_runtime/neighbors/ivf_flat_search.cu + src/raft_runtime/neighbors/ivf_flat_serialize.cu + src/raft_runtime/neighbors/ivfpq_build.cu + src/raft_runtime/neighbors/ivfpq_deserialize.cu + src/raft_runtime/neighbors/ivfpq_search_float_int64_t.cu + src/raft_runtime/neighbors/ivfpq_search_int8_t_int64_t.cu + src/raft_runtime/neighbors/ivfpq_search_uint8_t_int64_t.cu + src/raft_runtime/neighbors/ivfpq_serialize.cu + src/raft_runtime/neighbors/refine_d_int64_t_float.cu + src/raft_runtime/neighbors/refine_d_int64_t_int8_t.cu + src/raft_runtime/neighbors/refine_d_int64_t_uint8_t.cu + src/raft_runtime/neighbors/refine_h_int64_t_float.cu + src/raft_runtime/neighbors/refine_h_int64_t_int8_t.cu + src/raft_runtime/neighbors/refine_h_int64_t_uint8_t.cu + src/raft_runtime/random/rmat_rectangular_generator_int64_double.cu + src/raft_runtime/random/rmat_rectangular_generator_int64_float.cu + src/raft_runtime/random/rmat_rectangular_generator_int_double.cu + src/raft_runtime/random/rmat_rectangular_generator_int_float.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu + 
src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu + src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu + src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu + src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu + src/util/memory_pool.cpp + ) + set_target_properties( + raft_lib + PROPERTIES OUTPUT_NAME raft + BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + endif() target_link_libraries( raft_lib @@ -419,20 +457,34 @@ if(RAFT_COMPILE_LIBRARY) # will just be cublas $ ) - target_compile_options( - raft_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" - "$<$:${RAFT_CUDA_FLAGS}>" - ) + if(DISABLE_CUDA) + target_compile_options( + raft_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + ) + else() + target_compile_options( + raft_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) + endif() # RAFT_COMPILED is set during compilation of libraft.so as well as downstream libraries (due to # "PUBLIC") target_compile_definitions(raft_lib PUBLIC "RAFT_COMPILED") + if(DISABLE_CUDA) + # Controls whether or not CUDA symbols are used in headers that may be used + # in CUDA-free builds + target_compile_definitions(raft_lib PUBLIC "RAFT_DISABLE_CUDA") + endif() + # RAFT_EXPLICIT_INSTANTIATE_ONLY is set during compilation of libraft.so (due to "PRIVATE") target_compile_definitions(raft_lib PRIVATE 
"RAFT_EXPLICIT_INSTANTIATE_ONLY") - # ensure CUDA symbols aren't relocated to the middle of the debug build binaries - target_link_options(raft_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") + if(NOT DISABLE_CUDA) + # ensure CUDA symbols aren't relocated to the middle of the debug build binaries + target_link_options(raft_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") + endif() endif() @@ -446,26 +498,33 @@ target_link_libraries(raft_compiled INTERFACE raft::raft $ +#include +#ifndef RAFT_DISABLE_CUDA +#include +#endif + +namespace raft::resource { +struct stream_view_resource : public resource { + stream_view_resource(raft::stream_view view = raft::stream_view_per_thread) : stream(view) + { + } + void* get_resource() override { return &stream; } + + ~stream_view_resource() override {} + + private: + raft::stream_view stream; +}; + +/** + * Factory that knows how to construct a specific raft::resource to populate + * the resources instance. + */ +struct stream_view_resource_factory : public resource_factory { + public: + stream_view_resource_factory(raft::stream_view view = raft::stream_view_per_thread) + : stream(view) + { + } + resource_type get_resource_type() override { return resource_type::STREAM_VIEW; } + resource* make_resource() override { return new stream_view_resource(stream); } + + private: + raft::stream_view stream; +}; + +/** + * @defgroup resource_stream_view stream resource functions compatible with + * non-CUDA builds + * @{ + */ +/** + * Load a raft::stream_view from a resources instance (and populate it on the res + * if needed). 
+ * @param res raft res object for managing resources + * @return the raft::stream_view held by the resources instance + */ +inline raft::stream_view get_stream_view(resources const& res) +{ + if (!res.has_resource_factory(resource_type::STREAM_VIEW)) { + res.add_resource_factory(std::make_shared<stream_view_resource_factory>()); + } + return *res.get_resource<raft::stream_view>(resource_type::STREAM_VIEW); +}; + +/** + * Set the raft::stream_view on a resources instance by adding a resource + * factory constructed from the given view. + * @param[in] res raft resources object for managing resources + * @param[in] view stream view to store on the resources instance + */ +inline void set_stream_view(resources const& res, raft::stream_view view) +{ + res.add_resource_factory(std::make_shared<stream_view_resource_factory>(view)); +}; + +/** + * @brief synchronize a specific stream + * + * @param[in] res the raft resources object + * @param[in] stream stream to synchronize + */ +inline void sync_stream_view(const resources& res, raft::stream_view stream) +{ + stream.interruptible_synchronize(); +} + +/** + * @brief synchronize main stream on the resources instance + */ +inline void sync_stream_view(const resources& res) { sync_stream_view(res, get_stream_view(res)); } + +/** + * @} + */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/stream_view.hpp b/cpp/include/raft/core/stream_view.hpp index 1fe3498359..1bf8fde6c1 100644 --- a/cpp/include/raft/core/stream_view.hpp +++ b/cpp/include/raft/core/stream_view.hpp @@ -13,9 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include +#include #include +#include #ifndef RAFT_DISABLE_CUDA +#include #include #endif @@ -23,30 +25,21 @@ namespace raft { namespace detail { struct fail_stream_view { - constexpr fail_stream_view(fail_stream_view const&) = default; - constexpr fail_stream_view(fail_stream_view&&) = default; + constexpr fail_stream_view() = default; + constexpr fail_stream_view(fail_stream_view const&) = default; + constexpr fail_stream_view(fail_stream_view&&) = default; auto constexpr operator=(fail_stream_view const&) -> fail_stream_view& = default; - auto constexpr operator=(fail_stream_view&&) -> fail_stream_view& = default; - auto value() { - throw non_cuda_build_error{ - "Attempted to access CUDA stream in non-CUDA build" - }; - } - [[nodiscard]] auto is_per_thread_default() const { - return false; + auto constexpr operator=(fail_stream_view&&) -> fail_stream_view& = default; + auto value() { throw non_cuda_build_error{"Attempted to access CUDA stream in non-CUDA build"}; } + [[nodiscard]] auto is_per_thread_default() const { return false; } + [[nodiscard]] auto is_default() const { return false; } + void synchronize() const + { + throw non_cuda_build_error{"Attempted to sync CUDA stream in non-CUDA build"}; } - [[nodiscard]] auto is_default() const { - return false; - } - void synchronize() const { - throw non_cuda_build_error{ - "Attempted to sync CUDA stream in non-CUDA build" - }; - } - void synchronize_no_throw() const { - RAFT_LOG_ERROR( - "Attempted to sync CUDA stream in non-CUDA build" - ); + void synchronize_no_throw() const + { + RAFT_LOG_ERROR("Attempted to sync CUDA stream in non-CUDA build"); } }; } // namespace detail @@ -66,39 +59,51 @@ struct stream_view { #else using underlying_view_type = detail::fail_stream_view; #endif - constexpr stream_view(stream_view const&) = default; - constexpr stream_view(stream_view&&) = default; - auto operator=(stream_view const&) -> stream_view& = default; - auto operator=(stream_view&&) -> stream_view& = default; - 
auto value() { - return base_view_.value(); - } - operator underlying_view_type() const noexcept { - return base_view_; - } - [[nodiscard]] auto is_per_thread_default() const { - return base_view_.is_per_thread_default(); - } - [[nodiscard]] auto is_default() const { - return base_view_.is_default(); - } - void synchronize() const { - base_view_.synchronize(); - } - void synchronize_no_throw() const { - base_view_.synchronize_no_throw(); - } - auto underlying() { - return base_view_; + constexpr stream_view(underlying_view_type base_view = stream_view::get_underlying_per_thread_default()) + : base_view_{base_view} + { } - void synchronize_if_cuda() { + constexpr stream_view(stream_view const&) = default; + constexpr stream_view(stream_view&&) = default; + auto operator=(stream_view const&) -> stream_view& = default; + auto operator=(stream_view&&) -> stream_view& = default; + auto value() { return base_view_.value(); } + operator underlying_view_type() const noexcept { return base_view_; } + [[nodiscard]] auto is_per_thread_default() const { return base_view_.is_per_thread_default(); } + [[nodiscard]] auto is_default() const { return base_view_.is_default(); } + void synchronize() const { base_view_.synchronize(); } + void synchronize_no_throw() const { base_view_.synchronize_no_throw(); } + void interruptible_synchronize() const + { #ifndef RAFT_DISABLE_CUDA - base_view_.synchronize(); + interruptible::synchronize(base_view_); +#else + synchronize(); #endif } + + auto underlying() { return base_view_; } + void synchronize_if_cuda_enabled() + { + if constexpr (raft::CUDA_ENABLED) { + base_view_.synchronize(); + } + } + private: underlying_view_type base_view_; + auto static get_underlying_per_thread_default() -> underlying_view_type + { +#ifndef RAFT_DISABLE_CUDA + return rmm::cuda_stream_per_thread; +#else + auto static constexpr const default_fail_stream = underlying_view_type{}; + return default_fail_stream; +#endif + } }; +auto static const 
stream_view_per_thread = stream_view{}; + } // namespace raft diff --git a/cpp/internal/CMakeLists.txt b/cpp/internal/CMakeLists.txt index 5d9e8c6f8b..cae278aa9e 100644 --- a/cpp/internal/CMakeLists.txt +++ b/cpp/internal/CMakeLists.txt @@ -17,5 +17,9 @@ if(BUILD_TESTS OR BUILD_PRIMS_BENCH) target_include_directories( raft_internal INTERFACE "$" ) - target_compile_features(raft_internal INTERFACE cxx_std_17 $) + if(DISABLE_CUDA) + target_compile_features(raft_internal INTERFACE cxx_std_17) + else() + target_compile_features(raft_internal INTERFACE cxx_std_17 $) + endif() endif() diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 33d4dd9423..e3d6e45a47 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -44,20 +44,34 @@ function(ConfigureTest) add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) - set_target_properties( - ${TEST_NAME} - PROPERTIES # set target compile options - INSTALL_RPATH "\$ORIGIN/../../../lib" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - ) - - target_compile_options( - ${TEST_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>" - "$<$:${RAFT_CUDA_FLAGS}>" - ) + if(DISABLE_CUDA) + set_target_properties( + ${TEST_NAME} + PROPERTIES # set target compile options + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + ) + + target_compile_options( + ${TEST_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + ) + else() + set_target_properties( + ${TEST_NAME} + PROPERTIES # set target compile options + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) + + target_compile_options( + ${TEST_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) + endif() if(ConfigureTest_EXPLICIT_INSTANTIATE_ONLY) target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY") @@ -81,309 +95,321 @@ endfunction() # * distance tests 
------------------------------------------------------------------------- if(BUILD_TESTS) - ConfigureTest( - NAME - CLUSTER_TEST - PATH - test/cluster/kmeans.cu - test/cluster/kmeans_balanced.cu - test/cluster/cluster_solvers.cu - test/cluster/linkage.cu - test/cluster/kmeans_find_k.cu - OPTIONAL - LIB - EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME - CORE_TEST - PATH - test/core/logger.cpp - test/core/math_device.cu - test/core/math_host.cpp - test/core/operators_device.cu - test/core/operators_host.cpp - test/core/handle.cpp - test/core/interruptible.cu - test/core/nvtx.cpp - test/core/mdarray.cu - test/core/mdspan_utils.cu - test/core/numpy_serializer.cu - test/core/memory_type.cpp - test/core/sparse_matrix.cu - test/core/sparse_matrix.cpp - test/core/span.cpp - test/core/span.cu - test/core/temporary_device_buffer.cu - test/test.cpp - OPTIONAL - LIB - EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME - DISTANCE_TEST - PATH - test/distance/dist_adj.cu - test/distance/dist_adj_distance_instance.cu - test/distance/dist_canberra.cu - test/distance/dist_correlation.cu - test/distance/dist_cos.cu - test/distance/dist_hamming.cu - test/distance/dist_hellinger.cu - test/distance/dist_inner_product.cu - test/distance/dist_jensen_shannon.cu - test/distance/dist_kl_divergence.cu - test/distance/dist_l1.cu - test/distance/dist_l2_exp.cu - test/distance/dist_l2_unexp.cu - test/distance/dist_l2_sqrt_exp.cu - test/distance/dist_l_inf.cu - test/distance/dist_lp_unexp.cu - test/distance/dist_russell_rao.cu - test/distance/masked_nn.cu - test/distance/masked_nn_compress_to_bits.cu - test/distance/fused_l2_nn.cu - test/distance/gram.cu - OPTIONAL - LIB - EXPLICIT_INSTANTIATE_ONLY - ) - - list( - APPEND - EXT_HEADER_TEST_SOURCES - test/ext_headers/raft_neighbors_brute_force.cu - test/ext_headers/raft_distance_distance.cu - test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu - test/ext_headers/raft_matrix_detail_select_k.cu - 
test/ext_headers/raft_neighbors_ball_cover.cu - test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu - test/ext_headers/raft_distance_fused_l2_nn.cu - test/ext_headers/raft_neighbors_ivf_pq.cu - test/ext_headers/raft_util_memory_pool.cpp - test/ext_headers/raft_neighbors_ivf_flat.cu - test/ext_headers/raft_core_logger.cpp - test/ext_headers/raft_neighbors_refine.cu - test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu - test/ext_headers/raft_neighbors_detail_selection_faiss.cu - test/ext_headers/raft_linalg_detail_coalesced_reduction.cu - test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu - test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu - test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu - ) - - # Test that the split headers compile in isolation with: - # - # * EXT_HEADERS_TEST_COMPILED_EXPLICIT: RAFT_COMPILED, RAFT_EXPLICIT_INSTANTIATE_ONLY defined - # * EXT_HEADERS_TEST_COMPILED_IMPLICIT: RAFT_COMPILED defined - # * EXT_HEADERS_TEST_IMPLICIT: no macros defined. 
- ConfigureTest( - NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} OPTIONAL LIB - EXPLICIT_INSTANTIATE_ONLY - ) - ConfigureTest( - NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} OPTIONAL LIB - ) - ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES}) - - ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu) - - ConfigureTest( - NAME - LINALG_TEST - PATH - test/linalg/add.cu - test/linalg/axpy.cu - test/linalg/binary_op.cu - test/linalg/cholesky_r1.cu - test/linalg/coalesced_reduction.cu - test/linalg/divide.cu - test/linalg/dot.cu - test/linalg/eig.cu - test/linalg/eig_sel.cu - test/linalg/gemm_layout.cu - test/linalg/gemv.cu - test/linalg/map.cu - test/linalg/map_then_reduce.cu - test/linalg/matrix_vector.cu - test/linalg/matrix_vector_op.cu - test/linalg/mean_squared_error.cu - test/linalg/multiply.cu - test/linalg/norm.cu - test/linalg/normalize.cu - test/linalg/power.cu - test/linalg/randomized_svd.cu - test/linalg/reduce.cu - test/linalg/reduce_cols_by_key.cu - test/linalg/reduce_rows_by_key.cu - test/linalg/rsvd.cu - test/linalg/sqrt.cu - test/linalg/strided_reduction.cu - test/linalg/subtract.cu - test/linalg/svd.cu - test/linalg/ternary_op.cu - test/linalg/transpose.cu - test/linalg/unary_op.cu - ) - - ConfigureTest( - NAME - MATRIX_TEST - PATH - test/matrix/argmax.cu - test/matrix/argmin.cu - test/matrix/columnSort.cu - test/matrix/diagonal.cu - test/matrix/gather.cu - test/matrix/eye.cu - test/matrix/linewise_op.cu - test/matrix/math.cu - test/matrix/matrix.cu - test/matrix/norm.cu - test/matrix/reverse.cu - test/matrix/select_k.cu - test/matrix/slice.cu - test/matrix/triangular.cu - test/sparse/spectral_matrix.cu - OPTIONAL - LIB - EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME - RANDOM_TEST - PATH - test/random/make_blobs.cu - test/random/make_regression.cu - test/random/multi_variable_gaussian.cu - test/random/permute.cu - test/random/rng.cu - 
test/random/rng_discrete.cu - test/random/rng_int.cu - test/random/rmat_rectangular_generator.cu - test/random/sample_without_replacement.cu - ) - - ConfigureTest( - NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu - test/lap/lap.cu test/sparse/mst.cu OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME - SPARSE_TEST - PATH - test/sparse/add.cu - test/sparse/convert_coo.cu - test/sparse/convert_csr.cu - test/sparse/csr_row_slice.cu - test/sparse/csr_to_dense.cu - test/sparse/csr_transpose.cu - test/sparse/degree.cu - test/sparse/filter.cu - test/sparse/norm.cu - test/sparse/normalize.cu - test/sparse/reduce.cu - test/sparse/row_op.cu - test/sparse/sort.cu - test/sparse/spgemmi.cu - test/sparse/symmetrize.cu - ) - - ConfigureTest( - NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu - test/sparse/gram.cu OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME - SPARSE_NEIGHBORS_TEST - PATH - test/sparse/neighbors/connect_components.cu - test/sparse/neighbors/brute_force.cu - test/sparse/neighbors/knn_graph.cu - OPTIONAL - LIB - EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME - NEIGHBORS_TEST - PATH - test/neighbors/ann_cagra/test_float_uint32_t.cu - test/neighbors/ann_cagra/test_int8_t_uint32_t.cu - test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu - test/neighbors/ann_cagra/test_float_int64_t.cu - test/neighbors/ann_ivf_flat/test_float_int64_t.cu - test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu - test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu - test/neighbors/ann_ivf_pq/test_float_int64_t.cu - test/neighbors/ann_ivf_pq/test_float_uint32_t.cu - test/neighbors/ann_ivf_pq/test_float_int64_t.cu - test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu - test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu - test/neighbors/knn.cu - test/neighbors/fused_l2_knn.cu - test/neighbors/tiled_knn.cu - test/neighbors/haversine.cu - test/neighbors/ball_cover.cu - 
test/neighbors/epsilon_neighborhood.cu - test/neighbors/refine.cu - test/neighbors/selection.cu - OPTIONAL - LIB - EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME - STATS_TEST - PATH - test/stats/accuracy.cu - test/stats/adjusted_rand_index.cu - test/stats/completeness_score.cu - test/stats/contingencyMatrix.cu - test/stats/cov.cu - test/stats/dispersion.cu - test/stats/entropy.cu - test/stats/histogram.cu - test/stats/homogeneity_score.cu - test/stats/information_criterion.cu - test/stats/kl_divergence.cu - test/stats/mean.cu - test/stats/meanvar.cu - test/stats/mean_center.cu - test/stats/minmax.cu - test/stats/mutual_info_score.cu - test/stats/r2_score.cu - test/stats/rand_index.cu - test/stats/regression_metrics.cu - test/stats/silhouette_score.cu - test/stats/stddev.cu - test/stats/sum.cu - test/stats/trustworthiness.cu - test/stats/weighted_mean.cu - test/stats/v_measure.cu - OPTIONAL - LIB - EXPLICIT_INSTANTIATE_ONLY - ) + if(NOT DISABLE_CUDA) + ConfigureTest( + NAME + CLUSTER_TEST + PATH + test/cluster/kmeans.cu + test/cluster/kmeans_balanced.cu + test/cluster/cluster_solvers.cu + test/cluster/linkage.cu + test/cluster/kmeans_find_k.cu + OPTIONAL + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest( + NAME + CORE_TEST + PATH + test/core/logger.cpp + test/core/math_device.cu + test/core/math_host.cpp + test/core/operators_device.cu + test/core/operators_host.cpp + test/core/handle.cpp + test/core/interruptible.cu + test/core/nvtx.cpp + test/core/mdarray.cu + test/core/mdspan_utils.cu + test/core/numpy_serializer.cu + test/core/memory_type.cpp + test/core/sparse_matrix.cu + test/core/sparse_matrix.cpp + test/core/span.cpp + test/core/span.cu + test/core/temporary_device_buffer.cu + test/test.cpp + OPTIONAL + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest( + NAME + DISTANCE_TEST + PATH + test/distance/dist_adj.cu + test/distance/dist_adj_distance_instance.cu + test/distance/dist_canberra.cu + test/distance/dist_correlation.cu + 
test/distance/dist_cos.cu + test/distance/dist_hamming.cu + test/distance/dist_hellinger.cu + test/distance/dist_inner_product.cu + test/distance/dist_jensen_shannon.cu + test/distance/dist_kl_divergence.cu + test/distance/dist_l1.cu + test/distance/dist_l2_exp.cu + test/distance/dist_l2_unexp.cu + test/distance/dist_l2_sqrt_exp.cu + test/distance/dist_l_inf.cu + test/distance/dist_lp_unexp.cu + test/distance/dist_russell_rao.cu + test/distance/masked_nn.cu + test/distance/masked_nn_compress_to_bits.cu + test/distance/fused_l2_nn.cu + test/distance/gram.cu + OPTIONAL + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + list( + APPEND + EXT_HEADER_TEST_SOURCES + test/ext_headers/raft_neighbors_brute_force.cu + test/ext_headers/raft_distance_distance.cu + test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu + test/ext_headers/raft_matrix_detail_select_k.cu + test/ext_headers/raft_neighbors_ball_cover.cu + test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu + test/ext_headers/raft_distance_fused_l2_nn.cu + test/ext_headers/raft_neighbors_ivf_pq.cu + test/ext_headers/raft_util_memory_pool.cpp + test/ext_headers/raft_neighbors_ivf_flat.cu + test/ext_headers/raft_core_logger.cpp + test/ext_headers/raft_neighbors_refine.cu + test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu + test/ext_headers/raft_neighbors_detail_selection_faiss.cu + test/ext_headers/raft_linalg_detail_coalesced_reduction.cu + test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu + test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu + test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu + ) + + # Test that the split headers compile in isolation with: + # + # * EXT_HEADERS_TEST_COMPILED_EXPLICIT: RAFT_COMPILED, RAFT_EXPLICIT_INSTANTIATE_ONLY defined + # * EXT_HEADERS_TEST_COMPILED_IMPLICIT: RAFT_COMPILED defined + # * EXT_HEADERS_TEST_IMPLICIT: no macros defined. 
+ ConfigureTest( + NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} OPTIONAL LIB + EXPLICIT_INSTANTIATE_ONLY + ) + ConfigureTest( + NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} OPTIONAL LIB + ) + ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES}) + + ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu) + + ConfigureTest( + NAME + LINALG_TEST + PATH + test/linalg/add.cu + test/linalg/axpy.cu + test/linalg/binary_op.cu + test/linalg/cholesky_r1.cu + test/linalg/coalesced_reduction.cu + test/linalg/divide.cu + test/linalg/dot.cu + test/linalg/eig.cu + test/linalg/eig_sel.cu + test/linalg/gemm_layout.cu + test/linalg/gemv.cu + test/linalg/map.cu + test/linalg/map_then_reduce.cu + test/linalg/matrix_vector.cu + test/linalg/matrix_vector_op.cu + test/linalg/mean_squared_error.cu + test/linalg/multiply.cu + test/linalg/norm.cu + test/linalg/normalize.cu + test/linalg/power.cu + test/linalg/randomized_svd.cu + test/linalg/reduce.cu + test/linalg/reduce_cols_by_key.cu + test/linalg/reduce_rows_by_key.cu + test/linalg/rsvd.cu + test/linalg/sqrt.cu + test/linalg/strided_reduction.cu + test/linalg/subtract.cu + test/linalg/svd.cu + test/linalg/ternary_op.cu + test/linalg/transpose.cu + test/linalg/unary_op.cu + ) + + ConfigureTest( + NAME + MATRIX_TEST + PATH + test/matrix/argmax.cu + test/matrix/argmin.cu + test/matrix/columnSort.cu + test/matrix/diagonal.cu + test/matrix/gather.cu + test/matrix/eye.cu + test/matrix/linewise_op.cu + test/matrix/math.cu + test/matrix/matrix.cu + test/matrix/norm.cu + test/matrix/reverse.cu + test/matrix/select_k.cu + test/matrix/slice.cu + test/matrix/triangular.cu + test/sparse/spectral_matrix.cu + OPTIONAL + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest( + NAME + RANDOM_TEST + PATH + test/random/make_blobs.cu + test/random/make_regression.cu + test/random/multi_variable_gaussian.cu + test/random/permute.cu + test/random/rng.cu + 
test/random/rng_discrete.cu + test/random/rng_int.cu + test/random/rmat_rectangular_generator.cu + test/random/sample_without_replacement.cu + ) + + ConfigureTest( + NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu + test/lap/lap.cu test/sparse/mst.cu OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest( + NAME + SPARSE_TEST + PATH + test/sparse/add.cu + test/sparse/convert_coo.cu + test/sparse/convert_csr.cu + test/sparse/csr_row_slice.cu + test/sparse/csr_to_dense.cu + test/sparse/csr_transpose.cu + test/sparse/degree.cu + test/sparse/filter.cu + test/sparse/norm.cu + test/sparse/normalize.cu + test/sparse/reduce.cu + test/sparse/row_op.cu + test/sparse/sort.cu + test/sparse/spgemmi.cu + test/sparse/symmetrize.cu + ) + + ConfigureTest( + NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu + test/sparse/gram.cu OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest( + NAME + SPARSE_NEIGHBORS_TEST + PATH + test/sparse/neighbors/connect_components.cu + test/sparse/neighbors/brute_force.cu + test/sparse/neighbors/knn_graph.cu + OPTIONAL + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest( + NAME + NEIGHBORS_TEST + PATH + test/neighbors/ann_cagra/test_float_uint32_t.cu + test/neighbors/ann_cagra/test_int8_t_uint32_t.cu + test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu + test/neighbors/ann_cagra/test_float_int64_t.cu + test/neighbors/ann_ivf_flat/test_float_int64_t.cu + test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu + test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu + test/neighbors/ann_ivf_pq/test_float_int64_t.cu + test/neighbors/ann_ivf_pq/test_float_uint32_t.cu + test/neighbors/ann_ivf_pq/test_float_int64_t.cu + test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu + test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu + test/neighbors/knn.cu + test/neighbors/fused_l2_knn.cu + test/neighbors/tiled_knn.cu + test/neighbors/haversine.cu + test/neighbors/ball_cover.cu + 
test/neighbors/epsilon_neighborhood.cu + test/neighbors/refine.cu + test/neighbors/selection.cu + OPTIONAL + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest( + NAME + STATS_TEST + PATH + test/stats/accuracy.cu + test/stats/adjusted_rand_index.cu + test/stats/completeness_score.cu + test/stats/contingencyMatrix.cu + test/stats/cov.cu + test/stats/dispersion.cu + test/stats/entropy.cu + test/stats/histogram.cu + test/stats/homogeneity_score.cu + test/stats/information_criterion.cu + test/stats/kl_divergence.cu + test/stats/mean.cu + test/stats/meanvar.cu + test/stats/mean_center.cu + test/stats/minmax.cu + test/stats/mutual_info_score.cu + test/stats/r2_score.cu + test/stats/rand_index.cu + test/stats/regression_metrics.cu + test/stats/silhouette_score.cu + test/stats/stddev.cu + test/stats/sum.cu + test/stats/trustworthiness.cu + test/stats/weighted_mean.cu + test/stats/v_measure.cu + OPTIONAL + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + endif() - ConfigureTest( - NAME - UTILS_TEST - PATH - test/core/seive.cu - test/util/bitonic_sort.cu - test/util/cudart_utils.cpp - test/util/device_atomics.cu - test/util/integer_utils.cpp - test/util/pow2_utils.cu - test/util/reduction.cu - ) + if(RAFT_DISABLE_CUDA) + ConfigureTest( + NAME + UTILS_TEST + PATH + test/core/stream_view.cpp + ) + else() + ConfigureTest( + NAME + UTILS_TEST + PATH + test/core/seive.cu + test/core/stream_view.cpp + test/util/bitonic_sort.cu + test/util/cudart_utils.cpp + test/util/device_atomics.cu + test/util/integer_utils.cpp + test/util/pow2_utils.cu + test/util/reduction.cu + ) + endif() endif() diff --git a/cpp/test/core/stream_view.cpp b/cpp/test/core/stream_view.cpp new file mode 100644 index 0000000000..895ac18c79 --- /dev/null +++ b/cpp/test/core/stream_view.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#endif +namespace raft { +TEST(StreamView, Default) { + auto stream = stream_view_per_thread; + ASSERT_EQ(stream.is_per_thread_default(), raft::CUDA_ENABLED); + ASSERT_FALSE(stream.is_default()); + if (raft::CUDA_ENABLED) { + EXPECT_NO_THROW(stream.synchronize()); + EXPECT_NO_THROW(stream.interruptible_synchronize()); + } else { + EXPECT_THROW(stream.synchronize(), raft::non_cuda_build_error); + EXPECT_THROW(stream.interruptible_synchronize(), raft::non_cuda_build_error); + } + EXPECT_NO_THROW(stream.synchronize_no_throw()); + EXPECT_NO_THROW(stream.synchronize_if_cuda_enabled()); +#ifndef RAFT_DISABLE_CUDA + static_assert( + std::is_same_v, "underlying should return rmm::cuda_stream_view" + ); +#endif +} +} // namespace raft From 46890525b4b8f5c4550d0cf736bbaf5757689e10 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 11 Jul 2023 15:15:32 -0400 Subject: [PATCH 33/75] Add initial set of CUDA-free tests --- build.sh | 9 +++++++- cpp/CMakeLists.txt | 7 +++++- cpp/cmake/thirdparty/get_fmt.cmake | 22 ++++++++++++++++++ cpp/cmake/thirdparty/get_spdlog.cmake | 33 +++++++++++++++++++++++++++ cpp/test/CMakeLists.txt | 27 +++++++++++++--------- 5 files changed, 85 insertions(+), 13 deletions(-) create mode 100644 cpp/cmake/thirdparty/get_fmt.cmake create mode 100644 cpp/cmake/thirdparty/get_spdlog.cmake diff --git a/build.sh b/build.sh index ab904abdad..7ce3980c4c 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* # scripts, and that this script resides in the repo 
dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pylibraft raft-dask docs tests template bench-prims bench-ann clean --uninstall -v -g -n --compile-lib --allgpuarch --no-nvtx --show_depr_warn --incl-cache-stats --time -h" +VALIDARGS="clean libraft pylibraft raft-dask docs tests template bench-prims bench-ann clean --uninstall -v -g -n --compile-lib --allgpuarch --no-nvtx --disable-cuda --show_depr_warn --incl-cache-stats --time -h" HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=] [--limit-tests=] [--limit-bench-prims=] [--limit-bench-ann=] [--build-metrics=] where is: clean - remove all existing build artifacts and configuration (start over) @@ -44,6 +44,7 @@ HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=) @@ -240,7 +245,7 @@ endif() # ################################################################################################## # * NVTX support in raft ----------------------------------------------------- -if(RAFT_NVTX) +if(RAFT_NVTX AND (NOT DISABLE_CUDA)) # This enables NVTX within the project with no option to disable it downstream. target_link_libraries(raft INTERFACE CUDA::nvToolsExt) target_compile_definitions(raft INTERFACE NVTX_ENABLED) diff --git a/cpp/cmake/thirdparty/get_fmt.cmake b/cpp/cmake/thirdparty/get_fmt.cmake new file mode 100644 index 0000000000..5787fb73fb --- /dev/null +++ b/cpp/cmake/thirdparty/get_fmt.cmake @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Use CPM to find or clone fmt +function(find_and_configure_fmt) + + include(${rapids-cmake-dir}/cpm/fmt.cmake) + rapids_cpm_fmt(INSTALL_EXPORT_SET rmm-exports BUILD_EXPORT_SET rmm-exports) +endfunction() + +find_and_configure_fmt() diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake new file mode 100644 index 0000000000..24bbea89d5 --- /dev/null +++ b/cpp/cmake/thirdparty/get_spdlog.cmake @@ -0,0 +1,33 @@ +# ============================================================================= +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +# Use CPM to find or clone spdlog +function(find_and_configure_spdlog) + + include(${rapids-cmake-dir}/cpm/spdlog.cmake) + rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET rmm-exports) + rapids_export_package(BUILD spdlog rmm-exports) + + if(spdlog_ADDED) + rapids_export( + BUILD spdlog + EXPORT_SET spdlog + GLOBAL_TARGETS spdlog spdlog_header_only + NAMESPACE spdlog::) + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root(BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] rmm-exports) + endif() +endfunction() + +find_and_configure_spdlog() diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index e3d6e45a47..8ca541073c 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -56,6 +56,7 @@ function(ConfigureTest) target_compile_options( ${TEST_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>" ) + target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_DISABLE_CUDA") else() set_target_properties( ${TEST_NAME} @@ -130,6 +131,7 @@ if(BUILD_TESTS) test/core/sparse_matrix.cpp test/core/span.cpp test/core/span.cu + test/core/stream_view.cpp test/core/temporary_device_buffer.cu test/test.cpp OPTIONAL @@ -388,22 +390,11 @@ if(BUILD_TESTS) LIB EXPLICIT_INSTANTIATE_ONLY ) - endif() - - if(RAFT_DISABLE_CUDA) - ConfigureTest( - NAME - UTILS_TEST - PATH - test/core/stream_view.cpp - ) - else() ConfigureTest( NAME UTILS_TEST PATH test/core/seive.cu - test/core/stream_view.cpp test/util/bitonic_sort.cu test/util/cudart_utils.cpp test/util/device_atomics.cu @@ -411,5 +402,19 @@ if(BUILD_TESTS) test/util/pow2_utils.cu test/util/reduction.cu ) + else() + ConfigureTest( + NAME + CORE_TEST + PATH + test/core/logger.cpp + test/core/math_host.cpp + test/core/operators_host.cpp + test/core/memory_type.cpp + test/core/stream_view.cpp + OPTIONAL + LIB + EXPLICIT_INSTANTIATE_ONLY + ) endif() endif() From
1b7e1e5be9dd691290ab9eb259b7309d6c579603 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Mon, 17 Jul 2023 14:36:38 -0400 Subject: [PATCH 34/75] Add variant types to mdbuffer --- cpp/include/raft/core/mdbuffer.hpp | 375 ++++++++++++++++++++++----- cpp/include/raft/util/type_utils.hpp | 53 ++++ 2 files changed, 360 insertions(+), 68 deletions(-) create mode 100644 cpp/include/raft/util/type_utils.hpp diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp index 477b2cdc7e..f9b3a8d8cf 100644 --- a/cpp/include/raft/core/mdbuffer.hpp +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -18,97 +18,262 @@ #include #include #include -#include +#include +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#endif namespace raft { -namespace detail { -#ifdef RAFT_DISABLE_CUDA -using buffer_stream_view = rmm::cuda_stream_view; -#else -struct buffer_stream_view { - auto value() const { - throw non_cuda_build_error{ - "Attempted to access CUDA stream in non-CUDA build" - }; - } - [[nodiscard]] auto is_per_thread_default() const { - throw non_cuda_build_error{ - "Attempted to access CUDA stream in non-CUDA build" - }; - return false; - } - [[nodiscard]] auto is_default() const { - throw non_cuda_build_error{ - "Attempted to access CUDA stream in non-CUDA build" - }; - return false; - } - void synchronize() const { - throw non_cuda_build_error{ - "Attempted to sync CUDA stream in non-CUDA build" - }; +inline auto constexpr variant_index_from_memory_type(raft::memory_type mem_type) { + return static_cast>(mem_type); +} + +template +using alternate_from_mem_type = std::variant_alternative_t; + + +template +using default_container_policy_variant = std::variant< + host_vector_policy, + device_uvector_policy, + managed_uvector_policy, + pinned_vector_policy +>; + +template > +struct universal_buffer_reference { + using value_type = typename std::remove_cv_t; + using pointer = value_type*; + using const_pointer = value_type const*; + + 
template < + typename RefType, + std::enable_if_t::reference, RefType + >, + std::is_same_v< + typename alternate_from_mem_type::reference, RefType + >, + std::is_same_v< + typename alternate_from_mem_type::reference, RefType + >, + std::is_same_v< + typename alternate_from_mem_type::reference, RefType + > + >> + > + + universal_buffer_reference(pointer ptr, memory_type mem_type, stream_view stream=stream_view_per_thread) + : ptr_{ptr}, mem_type_{mem_type}, stream_{stream} + { } - void synchronize_no_throw() const { - RAFT_LOG_ERROR( - "Attempted to sync CUDA stream in non-CUDA build" +#ifndef RAFT_DISABLE_CUDA + explicit universal_buffer_reference(thrust::device_ptr ptr, + memory_type mem_type=memory_type::device, + stream_view stream=stream_view_per_thread) + : universal_buffer_reference{ptr.get(), mem_type, stream} + { + RAFT_EXPECTS( + is_device_accessible(mem_type), + "Attempted to create host-only reference from Thrust device pointer" ); } -}; #endif -} // namespace detail -inline auto constexpr variant_index_from_memory_type(raft::memory_type mem_type) { - return static_cast>(mem_type); -} + operator value_type() const // NOLINT + { + auto result = value_type{}; + if (is_host_accessible(mem_type_)) { + result = *ptr_; + } else { +#ifdef RAFT_DISABLE_CUDA + throw non_cuda_build_error{ + "Attempted to access device reference in non-CUDA build" + }; +#else + update_host(&result, ptr_, 1, stream_); +#endif + } + return result; + } -template -using alternate_from_mem_type = std::variant_alternative_t; + auto operator=(value_type const& other) -> universal_buffer_reference& + { + if (is_host_accessible(mem_type_)) { + *ptr_ = other; + } else { +#ifdef RAFT_DISABLE_CUDA + throw non_cuda_build_error{ + "Attempted to assign to device reference in non-CUDA build" + }; +#else + update_device(ptr_, &other, 1, stream_); +#endif + } + return *this; + } + + private: + pointer ptr_; + raft::memory_type mem_type_; + raft::stream_view stream_; +}; template < - typename 
ElementType + typename ElementType, + typename ContainerPolicyVariant=default_container_policy_variant > struct default_buffer_container_policy { using element_type = ElementType; using value_type = std::remove_cv_t; - using container_policy_variant = std::variant< - host_vector_policy, - device_uvector_policy, - managed_uvector_policy, - pinned_vector_policy + + using reference = universal_buffer_reference; + using const_reference = universal_buffer_reference; + using pointer = element_type*; + using const_pointer = element_type const*; + + using container_policy_variant = ContainerPolicyVariant; + + template + using container_policy = alternate_from_mem_type; + + private: + template + using container_policy_at_index = std::variant_alternative_t; + + public: + using container_type_variant = std::variant< + typename container_policy_at_index<0>::container_type, + typename container_policy_at_index<1>::container_type, + typename container_policy_at_index<2>::container_type, + typename container_policy_at_index<3>::container_type >; template - using underlying_policy = alternate_from_mem_type; -}; + using container_type = alternate_from_mem_type; -template -struct universal_buffer_reference { - using value_type = typename ContainerPolicy::value_type; - using pointer = typename ContainerPolicy::value_type*; - using const_pointer = typename ContainerPolicy::value_type const*; - - using reference_variant = std::variant< - typename ContainerPolicy::template underlying_policy::reference, - typename ContainerPolicy::template underlying_policy::reference, - typename ContainerPolicy::template underlying_policy::reference, - typename ContainerPolicy::template underlying_policy::reference + using accessor_policy_variant = std::variant< + typename container_policy_at_index<0>::accessor_policy, + typename container_policy_at_index<1>::accessor_policy, + typename container_policy_at_index<2>::accessor_policy, + typename container_policy_at_index<3>::accessor_policy >; - using 
const_reference_variant = std::variant< - typename ContainerPolicy::template underlying_policy::const_reference, - typename ContainerPolicy::template underlying_policy::const_reference, - typename ContainerPolicy::template underlying_policy::const_reference, - typename ContainerPolicy::template underlying_policy::const_reference + + template + using accessor_policy = alternate_from_mem_type; + + using const_accessor_policy_variant = std::variant< + typename container_policy_at_index<0>::const_accessor_policy, + typename container_policy_at_index<1>::const_accessor_policy, + typename container_policy_at_index<2>::const_accessor_policy, + typename container_policy_at_index<3>::const_accessor_policy >; - universal_buffer_reference(pointer ptr, raft::memory_type mem_type) - : ptr_{ptr}, mem_type_{mem_type} - { + template + using const_accessor_policy = alternate_from_mem_type; + + template + auto create(raft::resources const& res, size_t n) { + return container_type(res, n); } + + auto create(raft::resources const& res, size_t n, raft::memory_type mem_type) { + auto result = container_type_variant{}; + switch(mem_type) { + case raft::memory_type::host: + result = create(res, n); + break; + case raft::memory_type::device: + result = create(res, n); + break; + case raft::memory_type::managed: + result = create(res, n); + break; + case raft::memory_type::pinned: + result = create(res, n); + break; + } + return result; + } + private: - pointer ptr_; - raft::memory_type mem_type_; + template + auto static constexpr has_stream(ContainerType c) -> decltype(c.stream(), bool) { + return true; + }; + auto static has_stream(...) 
-> bool { + return false; + }; + + public: + template >()>* = nullptr> + [[nodiscard]] auto constexpr access(container_type& c, std::size_t n) const noexcept { + return reference{c.data() + n, MemType, c.stream()}; + } + + template >()>* = nullptr> + [[nodiscard]] auto constexpr access(container_type& c, std::size_t n) const noexcept { + return reference{c.data() + n, MemType}; + } + + template >()>* = nullptr> + [[nodiscard]] auto constexpr access(container_type const& c, std::size_t n) const noexcept { + return const_reference{c.data() + n, MemType, c.stream()}; + } + + template >()>* = nullptr> + [[nodiscard]] auto constexpr access(container_type const& c, std::size_t n) const noexcept { + return const_reference{c.data() + n, MemType}; + } + + template + [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } + template + [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } + + [[nodiscard]] auto make_accessor_policy(memory_type mem_type) noexcept { + auto result = accessor_policy_variant{}; + switch(mem_type) { + case memory_type::host: + result = make_accessor_policy(); + break; + case memory_type::device: + result = make_accessor_policy(); + break; + case memory_type::managed: + result = make_accessor_policy(); + break; + case memory_type::pinned: + result = make_accessor_policy(); + break; + } + return result; +} + [[nodiscard]] auto make_accessor_policy(memory_type mem_type) const noexcept { + auto result = const_accessor_policy_variant{}; + switch(mem_type) { + case memory_type::host: + result = make_accessor_policy(); + break; + case memory_type::device: + result = make_accessor_policy(); + break; + case memory_type::managed: + result = make_accessor_policy(); + break; + case memory_type::pinned: + result = make_accessor_policy(); + break; + } + return result; +} }; @@ -128,9 +293,83 @@ template < using difference_type = std::ptrdiff_t; using rank_type = typename extents_type::rank_type; - 
using owning_container_variant = std::variant< - mdarray + using container_type = typename container_policy_type::template container_type; + + using pointer = typename container_policy_type::pointer; + using const_pointer = typename container_policy_type::const_pointer; + using reference = typename container_policy_type::reference; + using const_reference = typename container_policy_type::const_reference; + + template + using owning_type = mdarray< + element_type, + extents_type, + layout_type, + typename container_policy_type::template container_policy + >; + using owning_type_variant = std::variant< + owning_type(0)>, + owning_type(1)>, + owning_type(2)>, + owning_type(3)> + >; + + template + using view_type = typename owning_type::view_type; + + using view_type_variant = std::variant< + view_type(0)>, + view_type(1)>, + view_type(2)>, + view_type(3)> >; + + template + using const_view_type = typename owning_type::const_view_type; + using const_view_type_variant = std::variant< + const_view_type(0)>, + const_view_type(1)>, + const_view_type(2)>, + const_view_type(3)> + >; + + using storage_type_variant = concatenated_variant_t; + + template + using storage_type = std::variant_alternative_t< + std::size_t{is_owning} * std::variant_size_v + + std::size_t{variant_index_from_memory_type(MemType)}, + storage_type_variant + >; + + constexpr mdbuffer() = default; + // The following constructor is included purely for symmetry with + // mdarray. 
+ constexpr explicit mdbuffer(raft::resources const& handle) : mdbuffer{} {} + + [[nodiscard]] auto constexpr mem_type() { + return static_cast(data_.index() % std::variant_size_v); + }; + [[nodiscard]] auto constexpr is_owning() { + return data_.index() >= std::variant_size_v; + }; + + [[nodiscard]] auto view() { + auto result = view_type_variant{}; + switch(data_.index()) { + case variant_index_from_memory_type(memory_type::host): + result = std::get(data_); + + } + } + + private: + storage_type_variant data_{}; }; } // namespace raft diff --git a/cpp/include/raft/util/type_utils.hpp b/cpp/include/raft/util/type_utils.hpp new file mode 100644 index 0000000000..1721d56f34 --- /dev/null +++ b/cpp/include/raft/util/type_utils.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace raft { + +template +struct concatenated_variant; + +template +struct concatenated_variant , std::variant>{ + using type = std::variant; +}; + +template +using concatenated_variant_t = typename concatenated_variant::type; + +template +auto fast_visit (visitor_t&& visitor, variant_t&& variant) { + using return_t = decltype( + std::forward(visitor)(std::get(std::forward(variant)))); + auto result = return_t{}; + + if (index == variant.index()) { + if (!std::holds_alternative>(variant)) { + __builtin_unreachable(); + } + result = std::forward(visitor)(std::get(std::forward(variant))); + } else if (index < std::variant_size_v) { + result = fast_visit( + std::forward(visitor), + std::forward(variant) + ); + } else { + __builtin_unreachable(); + } + return result; +} +} // namespace raft From 5416ceb240055acbd2c7ab822cb1538b050df7ce Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 18 Jul 2023 17:27:42 -0400 Subject: [PATCH 35/75] Provide all mdarray/mdspan to mdbuffer conversions --- cpp/include/raft/core/mdbuffer.hpp | 98 +++++++++++++------ .../{type_utils.hpp => variant_utils.hpp} | 26 ++--- cpp/test/CMakeLists.txt | 2 + cpp/test/core/mdbuffer.cpp | 66 +++++++++++++ cpp/test/core/mdbuffer.cu | 23 +++++ 5 files changed, 171 insertions(+), 44 deletions(-) rename cpp/include/raft/util/{type_utils.hpp => variant_utils.hpp} (69%) create mode 100644 cpp/test/core/mdbuffer.cpp create mode 100644 cpp/test/core/mdbuffer.cu diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp index f9b3a8d8cf..bb67cd795b 100644 --- a/cpp/include/raft/core/mdbuffer.hpp +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -14,14 +14,18 @@ * limitations under the License. 
*/ +#include +#include #include #include #include #include +#include #include #include +#include #include -#include +#include #ifndef RAFT_DISABLE_CUDA #include #include @@ -51,24 +55,6 @@ struct universal_buffer_reference { using pointer = value_type*; using const_pointer = value_type const*; - template < - typename RefType, - std::enable_if_t::reference, RefType - >, - std::is_same_v< - typename alternate_from_mem_type::reference, RefType - >, - std::is_same_v< - typename alternate_from_mem_type::reference, RefType - >, - std::is_same_v< - typename alternate_from_mem_type::reference, RefType - > - >> - > - universal_buffer_reference(pointer ptr, memory_type mem_type, stream_view stream=stream_view_per_thread) : ptr_{ptr}, mem_type_{mem_type}, stream_{stream} { @@ -142,7 +128,7 @@ struct default_buffer_container_policy { using container_policy_variant = ContainerPolicyVariant; template - using container_policy = alternate_from_mem_type; + using container_policy = host_device_accessor, MemType>; private: template @@ -205,10 +191,10 @@ struct default_buffer_container_policy { private: template - auto static constexpr has_stream(ContainerType c) -> decltype(c.stream(), bool) { + auto static constexpr has_stream() -> decltype(std::declval().stream(), bool()) { return true; }; - auto static has_stream(...) -> bool { + auto static constexpr has_stream(...) -> bool { return false; }; @@ -295,7 +281,7 @@ template < using container_policy_type = ContainerPolicy; - using container_type_variant = typename container_policy_type::container_type; + using container_type_variant = typename container_policy_type::container_type_variant; template using container_type = typename container_policy_type::template container_type; @@ -348,9 +334,24 @@ template < >; constexpr mdbuffer() = default; - // The following constructor is included purely for symmetry with - // mdarray. 
- constexpr explicit mdbuffer(raft::resources const& handle) : mdbuffer{} {} + + template , storage_type_variant>>* = nullptr> + constexpr mdbuffer(mdspan other) + : data_{other} + { + } + + template ::view_type, storage_type_variant>>* = nullptr> + constexpr mdbuffer(mdarray& other) + : mdbuffer{other.view()} + { + } + + template , storage_type_variant>>* = nullptr> + constexpr mdbuffer(mdarray&& other) + : data_{std::move(other)} + { + } [[nodiscard]] auto constexpr mem_type() { return static_cast(data_.index() % std::variant_size_v); @@ -358,14 +359,47 @@ template < [[nodiscard]] auto constexpr is_owning() { return data_.index() >= std::variant_size_v; }; + [[nodiscard]] auto constexpr data_handle() { + return fast_visit([](auto&& inner) { + if constexpr (std::is_convertible_v) { + return pointer{inner.data_handle()}; + } else { + return pointer{inner.data_handle().get()}; + } + }, data_); + }; + [[nodiscard]] auto constexpr data_handle() const { + return fast_visit([](auto&& inner) { + if constexpr (std::is_convertible_v) { + return const_pointer{inner.data_handle()}; + } else { + return const_pointer{inner.data_handle().get()}; + } + }, data_); + } - [[nodiscard]] auto view() { - auto result = view_type_variant{}; - switch(data_.index()) { - case variant_index_from_memory_type(memory_type::host): - result = std::get(data_); + private: + static auto constexpr get_view_from_data(view_type_variant const& data) { + return data; + } + static auto constexpr get_view_from_data(const_view_type_variant const& data) { + return data; + } + static auto constexpr get_view_from_data(owning_type_variant& data) { + return view_type_variant{data.view()}; + } + static auto constexpr get_view_from_data(owning_type_variant const& data) { + return const_view_type_variant{data.view()}; + } - } + public: + [[nodiscard]] auto view() { + return fast_visit( + [](auto&& inner) { + return get_view_from_data(inner); + }, + data_ + ); } private: diff --git 
a/cpp/include/raft/util/type_utils.hpp b/cpp/include/raft/util/variant_utils.hpp similarity index 69% rename from cpp/include/raft/util/type_utils.hpp rename to cpp/include/raft/util/variant_utils.hpp index 1721d56f34..d8c7a45efe 100644 --- a/cpp/include/raft/util/type_utils.hpp +++ b/cpp/include/raft/util/variant_utils.hpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include namespace raft { @@ -32,22 +33,23 @@ using concatenated_variant_t = typename concatenated_variant template auto fast_visit (visitor_t&& visitor, variant_t&& variant) { using return_t = decltype( - std::forward(visitor)(std::get(std::forward(variant)))); + std::forward(visitor)(std::get<0>(variant)) + ); auto result = return_t{}; - if (index == variant.index()) { - if (!std::holds_alternative>(variant)) { - __builtin_unreachable(); - } - result = std::forward(visitor)(std::get(std::forward(variant))); - } else if (index < std::variant_size_v) { - result = fast_visit( - std::forward(visitor), - std::forward(variant) - ); + if constexpr (index == std::variant_size_v>>) { + __builtin_unreachable(); } else { - __builtin_unreachable(); + if (index == variant.index()) { + result = std::forward(visitor)(std::get(std::forward(variant))); + } else { + result = fast_visit( + std::forward(visitor), + std::forward(variant) + ); + } } return result; } + } // namespace raft diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 8ca541073c..af569a60cb 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -124,6 +124,8 @@ if(BUILD_TESTS) test/core/interruptible.cu test/core/nvtx.cpp test/core/mdarray.cu + test/core/mdbuffer.cpp + test/core/mdbuffer.cu test/core/mdspan_utils.cu test/core/numpy_serializer.cu test/core/memory_type.cpp diff --git a/cpp/test/core/mdbuffer.cpp b/cpp/test/core/mdbuffer.cpp new file mode 100644 index 0000000000..72b7264bd7 --- /dev/null +++ b/cpp/test/core/mdbuffer.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#endif +namespace raft { +TEST(MDBuffer, DefaultConstructor) { + auto buf = mdbuffer>{}; +} + +TEST(MDBuffer, FromHost) { + auto res = raft::resources{}; + auto rows = 3; + auto features = 5; + auto matrix = make_host_matrix(res, rows, features); + auto buf = mdbuffer{matrix}; + ASSERT_EQ(buf.mem_type(), memory_type::host); + ASSERT_FALSE(buf.is_owning()); + ASSERT_EQ(buf.data_handle(), matrix.data_handle()); + + auto* ptr = matrix.data_handle(); + buf = mdbuffer{std::move(matrix)}; + ASSERT_EQ(buf.mem_type(), memory_type::host); + ASSERT_TRUE(buf.is_owning()); + ASSERT_EQ(buf.data_handle(), ptr); +} + +TEST(MDBuffer, FromDevice) { + auto res = raft::resources{}; + auto rows = 3; + auto features = 5; + auto matrix = make_device_matrix(res, rows, features); + auto buf = mdbuffer{matrix}; + ASSERT_EQ(buf.mem_type(), memory_type::device); + ASSERT_FALSE(buf.is_owning()); + ASSERT_EQ(buf.data_handle(), matrix.data_handle()); + + auto* ptr = matrix.data_handle(); + buf = mdbuffer{std::move(matrix)}; + ASSERT_EQ(buf.mem_type(), memory_type::device); + ASSERT_TRUE(buf.is_owning()); + ASSERT_EQ(buf.data_handle(), ptr); +} +} // namespace raft + diff --git a/cpp/test/core/mdbuffer.cu b/cpp/test/core/mdbuffer.cu new file mode 100644 index 0000000000..4843f0616d --- /dev/null +++ b/cpp/test/core/mdbuffer.cu 
@@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +namespace raft { +} // namespace raft From 355b3d4fa8467a01d4d620772747704ded960134 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Mon, 31 Jul 2023 17:25:07 -0400 Subject: [PATCH 36/75] Begin creating buffer copy utilities --- cpp/include/raft/core/detail/mdspan_copy.cuh | 42 +++++++ cpp/include/raft/core/detail/mdspan_copy.hpp | 121 +++++++++++++++++++ cpp/include/raft/core/mdbuffer.hpp | 115 +++++++++++++++++- 3 files changed, 277 insertions(+), 1 deletion(-) create mode 100644 cpp/include/raft/core/detail/mdspan_copy.cuh create mode 100644 cpp/include/raft/core/detail/mdspan_copy.hpp diff --git a/cpp/include/raft/core/detail/mdspan_copy.cuh b/cpp/include/raft/core/detail/mdspan_copy.cuh new file mode 100644 index 0000000000..7d86935a92 --- /dev/null +++ b/cpp/include/raft/core/detail/mdspan_copy.cuh @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace raft { +namespace detail { +auto static constexpr const TRANSPOSE_TILE_DIM = 32; + +template +__global__ void transpose( + OutType* out, + InType* in, + IndexType in_major_dim, + IndexType in_minor_dim +) { + __shared__ OutType tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1]; + auto static constexpr const TILE_ELEMENTS = ( + TRANSPOSE_TILE_DIM * TRANSPOSE_TILE_DIM + ); + auto const max_index = in_major_dim * in_minor_dim; + + for (auto i=0; i < max_index; i += TILE_ELEMENTS) { + auto in_x = blockIdx.x * TRANSPOSE_TILE_DIM + threadIdx.x; + auto in_y = blockIdx.y * TRANSPOSE_TILE_DIM + threadIdx.y; + tile[in_x][in_y] = static_cast(in[in_major * in_x + in_y]); + } +} + +} // namespace detail +} // namespace raft diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/mdspan_copy.hpp new file mode 100644 index 0000000000..f74bab33a4 --- /dev/null +++ b/cpp/include/raft/core/detail/mdspan_copy.hpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#endif + +namespace raft { +namespace detail { +template < + typename DstElementType, + typename DstExtents, + typename DstLayoutPolicy, + typename DstAccessorPolicy, + typename SrcElementType, + typename SrcExtents, + typename SrcLayoutPolicy, + typename SrcAccessorPolicy, + typename ExecutionPolicy, + std::enable_if_t, + SrcExtents::rank() == DstExtents::rank() + >>* = nullptr +> +void copy( + resources const& res, + mdspan & dst, + mdspan const& src, + ExecutionPolicy host_exec_policy = std::execution::unseq +) { + // TODO(Check size match?) + if constexpr ( + // Contiguous memory, no transpose required + std::conjunction_v< + std::is_same_v, + std::disjunction_v< + std::is_same_v, + std::is_same_v + > + > + ) { + if constexpr ( + std::disjunction_v< + std::conjunction_v< + CUDA_ENABLED, + ! DstAccessorPolicy::mem_type::is_device_accessible, + ! SrcAccessorPolicy::mem_type::is_device_accessible + >, + std::conjunction_v< + ! 
CUDA_ENABLED, + DstAccessorPolicy::mem_type::is_host_accessible, + SrcAccessorPolicy::mem_type::is_host_accessible + > + > + ) { + std::copy( + host_exec_policy, + src.data_handle(), + src.data_handle() + src.size(), + dst.data_handle() + ); + } else { +#ifndef RAFT_DISABLE_CUDA + if constexpr(std::is_same_v>) { + raft::copy( + dst.data_handle(), + src.data_handle(), + src.size(), + get_stream_view(res) + ); + } else { + // TODO(wphicks): Convert type on src device and then copy + } +#else + throw non_cuda_build_error{ + "Attempted copy to/from device in non-CUDA build" + }; +#endif + } + } else { // Non-contiguous memory or transpose required + if constexpr ( + std::conjunction_v< + DstAccessorPolicy::mem_type::is_device_accessible, + SrcAccessorPolicy::mem_type::is_device_accessible + > + ) { + // TODO(wphicks): Conversion/transpose kernel + } else if constexpr ( + std::conjunction_v< + DstAccessorPolicy::mem_type::is_host_accessible, + SrcAccessorPolicy::mem_type::is_host_accessible + > + ) { + // TODO(wphicks): CPU conversion + } else { + // TODO(wphicks): Copy to intermediate mdarray on dest device, then call + // recursively for transpose/conversion + } + } +} +} // namespace detail +} // namespace raft diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp index bb67cd795b..a73e5b1249 100644 --- a/cpp/include/raft/core/mdbuffer.hpp +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -14,9 +14,12 @@ * limitations under the License. 
*/ +#include +#include #include #include #include +#include #include #include #include @@ -40,6 +43,101 @@ inline auto constexpr variant_index_from_memory_type(raft::memory_type mem_type) template using alternate_from_mem_type = std::variant_alternative_t; +namespace detail { + +template < + typename DstElementType, + typename DstExtents, + typename DstLayoutPolicy, + typename DstAccessorPolicy, + typename SrcElementType, + typename SrcExtents, + typename SrcLayoutPolicy, + typename SrcAccessorPolicy, + typename ExecutionPolicy, + std::enable_if_t, + SrcExtents::rank() == DstExtents::rank() + >>* = nullptr +> +void copy( + resources const& res, + mdspan & dst, + mdspan const& src, + ExecutionPolicy host_exec_policy = std::execution::unseq +) { + // TODO(Check size match?) + if constexpr ( + // Contiguous memory, no transpose required + std::conjunction_v< + std::is_same_v, + std::disjunction_v< + std::is_same_v, + std::is_same_v + > + > + ) { + if constexpr ( + std::disjunction_v< + std::conjunction_v< + CUDA_ENABLED, + ! DstAccessorPolicy::mem_type::is_device_accessible, + ! SrcAccessorPolicy::mem_type::is_device_accessible + >, + std::conjunction_v< + ! 
CUDA_ENABLED, + DstAccessorPolicy::mem_type::is_host_accessible, + SrcAccessorPolicy::mem_type::is_host_accessible + >, + > + ) { + std::copy( + host_exec_policy, + src.data_handle(), + src.data_handle() + src.size(), + dst.data_handle() + ); + } else { +#ifndef RAFT_DISABLE_CUDA + if constexpr(std::is_same_v)) { + raft::copy( + dst.data_handle(), + src.data_handle(), + src.size(), + get_stream_view(res) + ); + } else { + // TODO(wphicks): Convert type on src device and then copy + } +#else + throw non_cuda_build_error{ + "Attempted copy to/from device in non-CUDA build" + }; +#endif + } + } else { // Non-contiguous memory or transpose required + if constexpr ( + std::conjunction_v< + DstAccessorPolicy::mem_type::is_device_accessible, + SrcAccessorPolicy::mem_type::is_device_accessible + > + ) { + // TODO(wphicks): Conversion/transpose kernel + } else if constexpr ( + std::conjunction_v< + DstAccessorPolicy::mem_type::is_host_accessible, + SrcAccessorPolicy::mem_type::is_host_accessible + > + ) { + // TODO(wphicks): CPU conversion + } else { + // TODO(wphicks): Copy to intermediate mdarray on dest device, then call + // recursively for transpose/conversion + } + } +} +} // namespace detail + template using default_container_policy_variant = std::variant< @@ -337,7 +435,7 @@ template < template , storage_type_variant>>* = nullptr> constexpr mdbuffer(mdspan other) - : data_{other} + : data_{std::move(other)} { } @@ -353,6 +451,21 @@ template < { } + template , + Extents::rank() == OtherExtents::rank() + >>* = nullptr> + constexpr mdbuffer( + resources const& res, + mdbuffer const& other) + : data_{other.data_} + { + } + [[nodiscard]] auto constexpr mem_type() { return static_cast(data_.index() % std::variant_size_v); }; From 4770a837feb7a2c735dd1ae47f89572807b16e98 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Fri, 18 Aug 2023 17:23:14 -0400 Subject: [PATCH 37/75] Correct computation of dest indices --- cpp/include/raft/core/detail/mdspan_copy.cuh | 6 +++--- 1 
file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/core/detail/mdspan_copy.cuh b/cpp/include/raft/core/detail/mdspan_copy.cuh index 41724cfe02..2cdde90e98 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.cuh +++ b/cpp/include/raft/core/detail/mdspan_copy.cuh @@ -121,11 +121,11 @@ mdspan_device_copy(DstType dst, SrcType src) get_mdspan_elem(dst, dst_indices) = tile(tile_quick, tile_slow) } } - increment_indices(dst_indices, max_indices, gridDim.x); + increment_indices(dst_indices, max_indices, gridDim.x); } - increment_indices(dst_indices, max_indices, gridDim.y * TileDim); + increment_indices(dst_indices, max_indices, gridDim.y * TileDim); } - valid_indices &= increment_indices( + valid_indices &= increment_indices( src_indices, max_indices, blockDim.x * tile_elements); increment_indices(dst_indices, max_indices, blockDim.x * tile_elements); __syncthreads(); From 8237a74cd16bd17c43134563a6576bd937135467 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Wed, 23 Aug 2023 14:42:26 -0400 Subject: [PATCH 38/75] Temporarily remove simd-accelerated copy --- cpp/include/raft/core/detail/mdspan_copy.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/mdspan_copy.hpp index faecd9bfc6..8eb618681e 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.hpp +++ b/cpp/include/raft/core/detail/mdspan_copy.hpp @@ -118,12 +118,10 @@ copy(resources const& res, DstType& dst, SrcType const& src) if constexpr (same_layout && both_contiguous) { // Use STL if possible; this should be well optimized std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); - } else if constexpr (both_contiguous && both_float_or_double && simd_available) { - // Next, use SIMD intrinsics if possible, since generic one-by-one copy implementation is hard - // for the compiler to vectorize - - // simd transpose, possibly with dtype conversion } else { + // 
TODO (wphicks): Use SIMD for both_contiguous && + // both_float_or_double + // Finally, copy elements one by one, trying at least to perform // cache-friendly reads From 022cf6e70e628cdd4f9d8a6f857b15a3f772a926 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 29 Aug 2023 16:57:01 -0400 Subject: [PATCH 39/75] Add initial mdspan copy utility implementation --- cpp/include/raft/core/detail/mdspan_copy.hpp | 214 +++++++++++++++---- cpp/include/raft/core/mdspan_copy.cuh | 23 ++ cpp/include/raft/core/mdspan_copy.hpp | 21 ++ cpp/test/CMakeLists.txt | 11 +- cpp/test/core/mdspan_copy.cpp | 84 ++++++++ 5 files changed, 309 insertions(+), 44 deletions(-) create mode 100644 cpp/include/raft/core/mdspan_copy.cuh create mode 100644 cpp/include/raft/core/mdspan_copy.hpp create mode 100644 cpp/test/core/mdspan_copy.cpp diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/mdspan_copy.hpp index 8eb618681e..e4a74572c1 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.hpp +++ b/cpp/include/raft/core/detail/mdspan_copy.hpp @@ -27,63 +27,194 @@ #include #include #include + #ifdef __CUDACC__ +#include + #endif #endif namespace raft { namespace detail { + +template +struct mdspan_copyable{}; + +template +struct mdspan_copyable { + using dst_type = std::remove_reference_t; + using src_type = std::remove_reference_t; + + // Dtype properties + using dst_value_type = typename dst_type::value_type; + using src_value_type = typename src_type::value_type; + using dst_element_type = typename dst_type::element_type; + using src_element_type = typename src_type::element_type; + auto static constexpr const same_dtype = std::is_same_v; + + auto static constexpr const dst_float = std::is_same_v; + auto static constexpr const src_float = std::is_same_v; + auto static constexpr const dst_double = std::is_same_v; + auto static constexpr const src_double = std::is_same_v; + + auto static constexpr const both_float = dst_float && src_float; + auto static 
constexpr const both_double = dst_double && src_double; + auto static constexpr const both_float_or_both_double = both_float || both_double; + + // Ranks + auto static constexpr const dst_rank = dst_type::extents_type::rank(); + auto static constexpr const src_rank = src_type::extents_type::rank(); + auto static constexpr const compatible_rank = (dst_rank == src_rank); + auto static constexpr const vector_rank = (dst_rank == 1); + auto static constexpr const matrix_rank = (dst_rank == 2); + + // Layout properties + using dst_layout_type = typename dst_type::layout_type; + using src_layout_type = typename src_type::layout_type; + + auto static constexpr const src_contiguous = std::disjunction_v< + std::is_same_v, + std::is_same_v + >; + + auto static constexpr const dst_contiguous = std::disjunction_v< + std::is_same_v, + std::is_same_v + >; + + auto static constexpr const both_contiguous = src_contiguous && dst_contiguous; + + // Accessibility + auto static constexpr const dst_device_accessible = is_device_mdspan_v; + auto static constexpr const src_device_accessible = is_device_mdspan_v; + auto static constexpr const both_device_accessible = dst_device_accessible && src_device_accessible; + + auto static constexpr const dst_host_accessible = is_host_mdspan_v; + auto static constexpr const src_host_accessible = is_host_mdspan_v; + auto static constexpr const both_host_accessible = dst_host_accessible && src_host_accessible; + + auto static constexpr const can_use_device = std::conjunction_v; + + auto static constexpr const can_use_host = both_host_accessible; + +#if (defined(__AVX__) || defined(__SSE__) || defined(__ARM_NEON)) + auto static constexpr const can_use_simd = both_host_accessible; +# else + auto static constexpr const can_use_simd = false; +#endif + + // Viable overload? 
+ using type = std::enable_if_t< + std::conjunction_v< + is_mdspan, + is_mdspan, + std::is_convertible, + std::bool_constant, + std::bool_constant + >, T + >; +}; + +// Need custom kernel if... +template +struct mdspan_copy_requires_custom_kernel : std::conjunction< + // CUDA build is enabled... + std::bool_constant, + // and both mdspans can be accessed on device... + std::bool_constant, SrcType>>, + // and we cannot use cudaMemcpyAsync or cuBLAS. + std::bool_constant::value_type, typename SrcType::value_type>, + // and layout is contiguous... + std::conjunction< + std::disjunction< + std::is_same::layout_type, layout_c_contiguous>, + std::is_same::layout_type, layout_f_contiguous> + >, + std::disjunction< + std::is_same, + std::is_same + > + >, + // and EITHER... + std::disjunction< + // the mdspans have the same layout (cudaMemcpyAsync)... + std::is_same::layout_type, typename SrcType::layout_type>, + // OR the mdspans are 1D (in which case the underlying memory layout + // is actually the same... 
+ std::bool_constant::extents_type::rank() == 1>, + // OR the data are a 2D matrix of either floats or doubles, in which + // case we can perform the transpose with cuBLAS + std::conjunction< + std::bool_constant::extents_type::rank() == 2>, + std::disjunction< + std::is_same::value_type, float>, + std::is_same::value_type, double> + > // end float or double check + > // end cuBLAS compatibility check + > // end cudaMemcpy || cuBLAS check + >> +> {}; + +template +auto constexpr mdspan_copy_requires_custom_kernel_v = mdspan_copy_requires_custom_kernel, SrcType>{}(); + + template std::enable_if_t< - std::conjunction_v, - is_mdspan_v, - std::is_convertible_v, - DstType::extents::rank() == SrcType::extents::rank()>> -copy(resources const& res, DstType& dst, SrcType const& src) + std::conjunction_v, SrcType>, + std::is_convertible_v::element_type>, + std::remove_reference_t::extents_type::rank() == SrcType::extents_type::rank()>> +copy(resources const& res, DstType&& dst, SrcType const& src) { using index_type = - std::conditional_t<(std::numeric_limits::max() > - std::numeric_limits::max()), - typename DstType::extents::index_type, - typename SrcType::extents::index_type>; + std::conditional_t<(std::numeric_limits::extents_type::index_type>::max() > + std::numeric_limits::max()), + typename std::remove_reference_t::extents_type::index_type, + typename SrcType::extents_type::index_type>; auto constexpr const both_contiguous = std::conjunction_v< - std::disjunction_v, - std::is_same_v>, + std::disjunction_v::layout_type, layout_c_contiguous>, + std::is_same_v::layout_type, layout_f_contiguous>>, std::disjunction_v, std::is_same_v>>; - auto constexpr const same_dtype = std::is_same_v; - auto constexpr const both_device_accessible = - std::conjunction_v, is_device_mdspan_v>; - auto constexpr const both_host_accessible = - std::conjunction_v, is_host_mdspan_v>; - auto constexpr const same_layout = std::is_same_v; + auto constexpr const same_dtype = 
std::is_same_v::value_type, typename SrcType::value_type>; + auto constexpr const both_device_accessible = is_device_mdspan_v, SrcType>; + auto constexpr const both_host_accessible = is_host_mdspan_v, SrcType>; + auto constexpr const same_layout = std::is_same_v::layout_type, typename SrcType::layout_type>; auto constexpr const can_use_device = std::conjunction_v; auto constexpr const both_float_or_double = - std::conjunction_v, - std::is_same_v>, - std::disjunction_v, - std::is_same_v>>; + std::conjunction_v::value_type, float>, + std::is_same_v::value_type, double>>, + std::disjunction_v, + std::is_same_v>>; auto constexpr const simd_available = false; // TODO(wphicks) - // TODO(wphicks): Think about data on different devices + // TODO(wphicks): If data are on different devices, perform a + // cudaMemcpyPeer and then call recursively if constexpr (!can_use_device) { - RAFT_EXPECTS(both_host_accessible, + static_assert(both_host_accessible, "Copying to/from non-host-accessible mdspan in non-CUDA-enabled build"); } - for (auto i = std::size_t{}; i < SrcType::extents::rank(); ++i) { + for (auto i = std::size_t{}; i < SrcType::extents_type::rank(); ++i) { RAFT_EXPECTS(src.extents(i) == dst.extents(i), "Must copy between mdspans of the same shape"); } - if constexpr (both_device_accessible && CUDA_ENABLED) { + if constexpr (can_use_device) { #ifndef RAFT_DISABLE_CUDA - if constexpr (same_dtype && same_layout && both_contiguous) { - // TODO(wphicks): stream - raft::copy(dst.data_handle(), src.data_handle(), dst.size()); + if constexpr (same_dtype && (same_layout || std::remove_reference_t::extents_type::rank() == 1) && both_contiguous) { + raft::copy( + dst.data_handle(), + src.data_handle(), + dst.size(), + resource::get_cuda_stream(res) + ); } else if constexpr (same_dtype && both_float_or_double && both_contiguous && - DstType::extents::rank() == 2) { - auto constexpr const alpha = typename DstType::value_type{1}; - auto constexpr const beta = typename 
DstType::value_type{0}; + std::remove_reference_t::extents_type::rank() == 2) { + auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; + auto constexpr const beta = typename std::remove_reference_t::value_type{0}; CUBLAS_TRY(cublasgeam(resource::get_cublas_handle(res), CUBLAS_OP_T, CUBLAS_OP_N, @@ -93,29 +224,28 @@ copy(resources const& res, DstType& dst, SrcType const& src) src.data_handle(), src.stride(0), &beta, - static_cast(nullptr), + static_cast::value_type*>(nullptr), dst.stride(0), dst.data_handle(), dst.stride(0), resource::get_cuda_stream(res))); } else { #ifdef __CUDACC__ - // custom kernel + // TODO(wphicks): Call kernel here #else // Ordinarily, we would just make this a .cuh file, but we do not want // to signal that it *must* be built with CUDA. Instead, if this header // is used in a way that requires a CUDA compiler, we fail with an // informative error message. static_assert( - !CUDA_ENABLED, - "When used in a CUDA-enabled build for non-trivial copies on device, mdspan_copy.hpp " - "includes a kernel launch and must be compiled with a CUDA-enabled compiler. Use this " - "header in a '.cu' file to ensure it is correctly compiled."); + !mdspan_copy_requires_custom_kernel_v, SrcType>, + "Selected instantiation of raft::copy requires nvcc compilation. Use raft/core/mdspan_copy.cuh instead of raft/core/mdspan_copy.hpp and #include it in a .cu file. The corresponding 'detail' headers should not be included anywhere else directly." 
+ ); #endif } #endif } else if constexpr (both_host_accessible) { - if constexpr (same_layout && both_contiguous) { + if constexpr ((same_layout || std::remove_reference_t::extents_type::rank() == 1) && both_contiguous) { // Use STL if possible; this should be well optimized std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); } else { @@ -125,12 +255,12 @@ copy(resources const& res, DstType& dst, SrcType const& src) // Finally, copy elements one by one, trying at least to perform // cache-friendly reads - auto indices = std::array{}; + auto indices = std::array::extents_type::rank()>{}; for (auto i = std::size_t{}; i < dst.size(); ++i) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v::layout_type, layout_c_contiguous>) { // For layout_right/layout_c_contiguous, we iterate over the // rightmost extent fastest - auto dim = DstType::extents::rank(); + auto dim = std::remove_reference_t::extents_type::rank(); while ((indices[dim]++) == dst.extent(dim)) { indices[dim] = index_type{}; --dim; @@ -156,7 +286,7 @@ copy(resources const& res, DstType& dst, SrcType const& src) #ifndef RAFT_DISABLE_CUDA if constexpr (same_dtype && same_layout && both_contiguous) { raft::copy(dst.data_handle(), src.data_handle(), dst.size()); - } else if constexpr (is_device_mdspan_v) { + } else if constexpr (is_device_mdspan_v>) { // Copy to device memory and call recursively } else { // Copy to host memory and call recursively diff --git a/cpp/include/raft/core/mdspan_copy.cuh b/cpp/include/raft/core/mdspan_copy.cuh new file mode 100644 index 0000000000..93cf853c9c --- /dev/null +++ b/cpp/include/raft/core/mdspan_copy.cuh @@ -0,0 +1,23 @@ +#pragma once +#include +#include +#include +#include +#include +#include +namespace raft { + +template +std::enable_if_t< + std::conjunction_v< + std::bool_constant>, + detail::mdspan_copy_requires_custom_kernel, + std::is_convertible, + std::bool_constant + > +> copy(resources const& res, DstType&& dst, SrcType 
const& src) { + detail::copy(res, dst, src); +} + +} // namespace raft + diff --git a/cpp/include/raft/core/mdspan_copy.hpp b/cpp/include/raft/core/mdspan_copy.hpp new file mode 100644 index 0000000000..166a6ec547 --- /dev/null +++ b/cpp/include/raft/core/mdspan_copy.hpp @@ -0,0 +1,21 @@ +#pragma once +#include +#include +#include +#include +#include +namespace raft { + +template +std::enable_if_t< + std::conjunction_v< + std::bool_constant, SrcType>>, + std::bool_constant, SrcType>>, + std::is_convertible::element_type>, + std::bool_constant::extents_type::rank() == SrcType::extents_type::rank()> + > +> copy(resources const& res, DstType&& dst, SrcType const& src) { + detail::copy(res, dst, src); +} + +} // namespace raft diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 9b52a4a27b..11c4afae85 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -131,6 +131,7 @@ if(BUILD_TESTS) test/core/interruptible.cu test/core/nvtx.cpp test/core/mdarray.cu + test/core/mdspan_copy.cpp test/core/mdspan_utils.cu test/core/numpy_serializer.cu test/core/memory_type.cpp @@ -440,8 +441,14 @@ if(BUILD_TESTS) ) else() ConfigureTest( - NAME CORE_TEST PATH test/core/logger.cpp test/core/math_host.cpp test/core/operators_host.cpp - test/core/memory_type.cpp test/core/stream_view.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY + NAME CORE_TEST PATH + test/core/logger.cpp + test/core/math_host.cpp + test/core/operators_host.cpp + test/core/memory_type.cpp + test/core/mdspan_copy.cpp + test/core/stream_view.cpp + OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) endif() endif() diff --git a/cpp/test/core/mdspan_copy.cpp b/cpp/test/core/mdspan_copy.cpp new file mode 100644 index 0000000000..665f8afe75 --- /dev/null +++ b/cpp/test/core/mdspan_copy.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +TEST(MDSpanCopy, Mdspan1D) { + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in = make_host_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { + return x; + }; + for (auto i=std::uint32_t{}; i < cols; ++i) { + in(i) = gen_unique_entry(i); + } + + auto out_different_contiguous_layout = make_host_vector(res, cols); + copy(res, out_different_contiguous_layout.view(), in.view()); + for (auto i=std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE(match(out_different_contiguous_layout(i), double(gen_unique_entry(i)), CompareApprox{0.0001})); + } +} + +TEST(MDSpanCopy, Mdspan3D) { + auto res = device_resources{}; + auto constexpr depth = std::uint32_t{5}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols = std::uint32_t{2}; + auto in = make_host_mdarray( + res, + extents{} + ); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { + return x * 7 + y * 11 + z * 13; + }; + + for (auto i=std::uint32_t{}; i < depth; ++i) { + for (auto j=std::uint32_t{}; j < rows; ++j) { + for (auto k=std::uint32_t{}; k < cols; ++k) { + in(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + + auto out_different_contiguous_layout = make_host_mdarray( + res, + extents{} + ); + copy(res, out_different_contiguous_layout.view(), in.view()); + + for (auto i=std::uint32_t{}; i < depth; ++i) { + for (auto j=std::uint32_t{}; j < rows; ++j) { + for (auto k=std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + 
out_different_contiguous_layout(i, j, k), + double(gen_unique_entry(i, j, k)), + CompareApprox{0.0001} + )); + } + } + } + +} +} // namespace raft From a1776f4254a6a44d533a5a70b3f7e9a2d634d314 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Thu, 31 Aug 2023 11:19:01 -0400 Subject: [PATCH 40/75] Refactor copy properties detection --- cpp/include/raft/core/detail/mdspan_copy.hpp | 247 ++++++++++++------- 1 file changed, 157 insertions(+), 90 deletions(-) diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/mdspan_copy.hpp index e4a74572c1..b51dba95ee 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.hpp +++ b/cpp/include/raft/core/detail/mdspan_copy.hpp @@ -24,6 +24,7 @@ #include #include #ifndef RAFT_DISABLE_CUDA +#include #include #include #include @@ -49,6 +50,7 @@ struct mdspan_copyable { using dst_element_type = typename dst_type::element_type; using src_element_type = typename src_type::element_type; auto static constexpr const same_dtype = std::is_same_v; + auto static constexpr const compatible_dtype = std::is_convertible_v; auto static constexpr const dst_float = std::is_same_v; auto static constexpr const src_float = std::is_same_v; @@ -70,6 +72,8 @@ struct mdspan_copyable { using dst_layout_type = typename dst_type::layout_type; using src_layout_type = typename src_type::layout_type; + auto static constexpr const same_layout = std::is_same_v; + auto static constexpr const src_contiguous = std::disjunction_v< std::is_same_v, std::is_same_v @@ -82,6 +86,12 @@ struct mdspan_copyable { auto static constexpr const both_contiguous = src_contiguous && dst_contiguous; + auto static constexpr const same_underlying_layout = std::disjunction_v< + std::bool_constant, + std::bool_constant + >; + + // Accessibility auto static constexpr const dst_device_accessible = is_device_mdspan_v; auto static constexpr const src_device_accessible = is_device_mdspan_v; @@ -91,118 +101,175 @@ struct mdspan_copyable { auto static constexpr 
const src_host_accessible = is_host_mdspan_v; auto static constexpr const both_host_accessible = dst_host_accessible && src_host_accessible; + // Allowed copy codepaths auto static constexpr const can_use_device = std::conjunction_v; auto static constexpr const can_use_host = both_host_accessible; #if (defined(__AVX__) || defined(__SSE__) || defined(__ARM_NEON)) - auto static constexpr const can_use_simd = both_host_accessible; + auto static constexpr const can_use_simd = both_host_accessible && both_contiguous; # else auto static constexpr const can_use_simd = false; #endif + auto static constexpr const can_use_std_copy = std::conjunction_v< + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant + >; + auto static constexpr const can_use_raft_copy = std::conjunction_v< + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant + >; + auto static constexpr const can_use_cublas = std::conjunction_v< + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant + >; + + auto static constexpr const requires_intermediate = !both_host_accessible && !both_device_accessible && !can_use_raft_copy; + + auto static constexpr const use_intermediate_dst = std::conjunction_v< + std::bool_constant, + std::bool_constant + >; + + auto static constexpr const use_intermediate_src = std::conjunction_v< + std::bool_constant, + std::bool_constant + >; + + auto static constexpr const custom_kernel_allowed = std::conjunction_v< + std::bool_constant, + std::bool_constant, + std::bool_constant< + !(can_use_raft_copy || can_use_cublas) + > + >; + + auto static constexpr const custom_kernel_required = std::conjunction_v< + std::bool_constant, + std::bool_constant, + std::bool_constant< + !(can_use_raft_copy || can_use_cublas) + > + >; + // Viable overload? 
- using type = std::enable_if_t< - std::conjunction_v< - is_mdspan, - is_mdspan, - std::is_convertible, - std::bool_constant, - std::bool_constant - >, T + // TODO(wphicks): Detect case where custom kernel would be required AFTER + // transfer only + auto static constexpr const value = std::conjunction_v< + is_mdspan, + is_mdspan, +#ifndef __CUDACC__ + std::bool_constant, +#endif + std::bool_constant, + std::bool_constant >; + using type = std::enable_if_t; }; -// Need custom kernel if... +template +using mdspan_copyable_t = typename mdspan_copyable::type; template -struct mdspan_copy_requires_custom_kernel : std::conjunction< - // CUDA build is enabled... - std::bool_constant, - // and both mdspans can be accessed on device... - std::bool_constant, SrcType>>, - // and we cannot use cudaMemcpyAsync or cuBLAS. - std::bool_constant::value_type, typename SrcType::value_type>, - // and layout is contiguous... - std::conjunction< - std::disjunction< - std::is_same::layout_type, layout_c_contiguous>, - std::is_same::layout_type, layout_f_contiguous> - >, - std::disjunction< - std::is_same, - std::is_same - > - >, - // and EITHER... - std::disjunction< - // the mdspans have the same layout (cudaMemcpyAsync)... - std::is_same::layout_type, typename SrcType::layout_type>, - // OR the mdspans are 1D (in which case the underlying memory layout - // is actually the same... 
- std::bool_constant::extents_type::rank() == 1>, - // OR the data are a 2D matrix of either floats or doubles, in which - // case we can perform the transpose with cuBLAS - std::conjunction< - std::bool_constant::extents_type::rank() == 2>, - std::disjunction< - std::is_same::value_type, float>, - std::is_same::value_type, double> - > // end float or double check - > // end cuBLAS compatibility check - > // end cudaMemcpy || cuBLAS check - >> -> {}; - -template -auto constexpr mdspan_copy_requires_custom_kernel_v = mdspan_copy_requires_custom_kernel, SrcType>{}(); - +using mdspan_copyable_v = typename mdspan_copyable::value; template -std::enable_if_t< - std::conjunction_v, SrcType>, - std::is_convertible_v::element_type>, - std::remove_reference_t::extents_type::rank() == SrcType::extents_type::rank()>> +mdspan_copyable_t copy(resources const& res, DstType&& dst, SrcType const& src) { - using index_type = - std::conditional_t<(std::numeric_limits::extents_type::index_type>::max() > - std::numeric_limits::max()), - typename std::remove_reference_t::extents_type::index_type, - typename SrcType::extents_type::index_type>; - auto constexpr const both_contiguous = std::conjunction_v< - std::disjunction_v::layout_type, layout_c_contiguous>, - std::is_same_v::layout_type, layout_f_contiguous>>, - std::disjunction_v, - std::is_same_v>>; - auto constexpr const same_dtype = std::is_same_v::value_type, typename SrcType::value_type>; - auto constexpr const both_device_accessible = is_device_mdspan_v, SrcType>; - auto constexpr const both_host_accessible = is_host_mdspan_v, SrcType>; - auto constexpr const same_layout = std::is_same_v::layout_type, typename SrcType::layout_type>; - auto constexpr const can_use_device = std::conjunction_v; - - auto constexpr const both_float_or_double = - std::conjunction_v::value_type, float>, - std::is_same_v::value_type, double>>, - std::disjunction_v, - std::is_same_v>>; - - auto constexpr const simd_available = false; // TODO(wphicks) - 
// TODO(wphicks): If data are on different devices, perform a - // cudaMemcpyPeer and then call recursively - - if constexpr (!can_use_device) { - static_assert(both_host_accessible, - "Copying to/from non-host-accessible mdspan in non-CUDA-enabled build"); - } - + using config = mdspan_copyable; for (auto i = std::size_t{}; i < SrcType::extents_type::rank(); ++i) { RAFT_EXPECTS(src.extents(i) == dst.extents(i), "Must copy between mdspans of the same shape"); } - if constexpr (can_use_device) { + if constexpr(config::use_intermediate_src) { + // Copy to intermediate source on device, then perform necessary + // changes in layout on device, directly into final destination + auto intermediate = device_mdarray< + typename config::src_value_type, + typename config::src_extents_type, + typename config::src_layout_type + >(res, src.extents()); + copy(res, intermediate.view(), src); + copy(res, dst, intermediate.view()); + + } else if constexpr(config::use_intermediate_dst) { + // Perform necessary changes in layout on device, then copy to final + // destination on host + auto intermediate = device_mdarray< + typename config::dst_value_type, + typename config::dst_extents_type, + typename config::dst_layout_type + >(res, dst.extents()); + copy(res, intermediate.view(), src); + copy(res, dst, intermediate.view()); + } else if constexpr(config::can_use_raft_copy) { +#ifndef RAFT_DISABLE_CUDA + raft::copy( + dst.data_handle(), + src.data_handle(), + dst.size(), + resource::get_cuda_stream(res) + ); +#endif + } else if constexpr(config::can_use_cublas) { + auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; + auto constexpr const beta = typename std::remove_reference_t::value_type{0}; + CUBLAS_TRY(cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(0), + dst.extent(1), + &alpha, + src.data_handle(), + src.stride(0), + &beta, + static_cast::value_type*>(nullptr), + dst.stride(0), + dst.data_handle(), + 
dst.stride(0), + resource::get_cuda_stream(res))); + } else if constexpr(config::can_use_std_copy) { + std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); + } else if constexpr(config::can_use_simd) { + } else { + auto indices = std::array::extents_type::rank()>{}; + for (auto i = std::size_t{}; i < dst.size(); ++i) { + if constexpr (std::is_same_v::layout_type, layout_c_contiguous>) { + // For layout_right/layout_c_contiguous, we iterate over the + // rightmost extent fastest + auto dim = std::remove_reference_t::extents_type::rank(); + while ((indices[dim]++) == dst.extent(dim)) { + indices[dim] = index_type{}; + --dim; + } + } else { + // For layout_left/layout_f_contiguous (and currently all other + // layouts), we iterate over the leftmost extent fastest + + // TODO(wphicks): Add additional specialization for non-C/F + // arrays that have a stride of 1 in one dimension. This would + // be a performance enhancement; it is not required for + // correctness. 
+ auto dim = std::size_t{}; + while ((indices[dim]++) == dst.extent(dim)) { + indices[dim] = index_type{}; + ++dim; + } + } + std::apply(dst, indices) = std::apply(src, indices); + } + } + + if constexpr (config::can_use_device) { #ifndef RAFT_DISABLE_CUDA if constexpr (same_dtype && (same_layout || std::remove_reference_t::extents_type::rank() == 1) && both_contiguous) { raft::copy( From a970dad23865a297550552c9f773339a17282fb8 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Fri, 1 Sep 2023 11:52:51 -0400 Subject: [PATCH 41/75] Correct detection of mdspan copy paths --- cpp/include/raft/core/detail/mdspan_copy.cuh | 4 +- cpp/include/raft/core/detail/mdspan_copy.hpp | 263 ++++++++++--------- cpp/include/raft/core/mdspan_copy.cuh | 9 +- cpp/include/raft/core/mdspan_copy.hpp | 11 +- 4 files changed, 147 insertions(+), 140 deletions(-) diff --git a/cpp/include/raft/core/detail/mdspan_copy.cuh b/cpp/include/raft/core/detail/mdspan_copy.cuh index 2cdde90e98..e54cc46dc5 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.cuh +++ b/cpp/include/raft/core/detail/mdspan_copy.cuh @@ -20,7 +20,7 @@ namespace raft { namespace detail { template -auto increment_indices(IdxType* indices, IdxType const* max_indices, int rank, int incr = 1) +__device__ auto increment_indices(IdxType* indices, IdxType const* max_indices, int rank, int incr = 1) { auto valid_index = true; auto dim = std::is_same_v ? rank : 0; @@ -46,7 +46,7 @@ template -auto& get_mdspan_elem(MdspanType& md, IdxType const* indices, ResT... resolved_indices) +__device__ auto& get_mdspan_elem(MdspanType& md, IdxType const* indices, ResT... 
resolved_indices) { if constexpr (remaining == IdxType{}) { return md(resolved_indices...); diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/mdspan_copy.hpp index b51dba95ee..1bca59d952 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.hpp +++ b/cpp/include/raft/core/detail/mdspan_copy.hpp @@ -44,6 +44,15 @@ struct mdspan_copyable { using dst_type = std::remove_reference_t; using src_type = std::remove_reference_t; + // Extents properties + using dst_extents_type = typename dst_type::extents_type; + using src_extents_type = typename src_type::extents_type; + using index_type = + std::conditional_t<(std::numeric_limits::max() > + std::numeric_limits::max()), + typename dst_extents_type::index_type, + typename src_extents_type::index_type>; + // Dtype properties using dst_value_type = typename dst_type::value_type; using src_value_type = typename src_type::value_type; @@ -62,8 +71,8 @@ struct mdspan_copyable { auto static constexpr const both_float_or_both_double = both_float || both_double; // Ranks - auto static constexpr const dst_rank = dst_type::extents_type::rank(); - auto static constexpr const src_rank = src_type::extents_type::rank(); + auto static constexpr const dst_rank = dst_extents_type::rank(); + auto static constexpr const src_rank = src_extents_type::rank(); auto static constexpr const compatible_rank = (dst_rank == src_rank); auto static constexpr const vector_rank = (dst_rank == 1); auto static constexpr const matrix_rank = (dst_rank == 2); @@ -90,6 +99,12 @@ struct mdspan_copyable { std::bool_constant, std::bool_constant >; + // Layout for intermediate tile if copying through custom kernel + using tile_layout_type = std::conditional_t< + src_contiguous, + src_layout_type, + std::conditional_t + >; // Accessibility @@ -102,9 +117,6 @@ struct mdspan_copyable { auto static constexpr const both_host_accessible = dst_host_accessible && src_host_accessible; // Allowed copy codepaths - auto static constexpr 
const can_use_device = std::conjunction_v; - - auto static constexpr const can_use_host = both_host_accessible; #if (defined(__AVX__) || defined(__SSE__) || defined(__ARM_NEON)) auto static constexpr const can_use_simd = both_host_accessible && both_contiguous; @@ -124,14 +136,6 @@ struct mdspan_copyable { std::bool_constant, std::bool_constant >; - auto static constexpr const can_use_cublas = std::conjunction_v< - std::bool_constant, - std::bool_constant, - std::bool_constant, - std::bool_constant, - std::bool_constant, - std::bool_constant - >; auto static constexpr const requires_intermediate = !both_host_accessible && !both_device_accessible && !can_use_raft_copy; @@ -144,10 +148,20 @@ struct mdspan_copyable { std::bool_constant, std::bool_constant >; + auto static constexpr const can_use_device = std::conjunction_v>; + + auto static constexpr const can_use_host = both_host_accessible; + auto static constexpr const can_use_cublas = std::conjunction_v< + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant + >; auto static constexpr const custom_kernel_allowed = std::conjunction_v< std::bool_constant, - std::bool_constant, std::bool_constant< !(can_use_raft_copy || can_use_cublas) > @@ -155,7 +169,6 @@ struct mdspan_copyable { auto static constexpr const custom_kernel_required = std::conjunction_v< std::bool_constant, - std::bool_constant, std::bool_constant< !(can_use_raft_copy || can_use_cublas) > @@ -165,13 +178,8 @@ struct mdspan_copyable { // TODO(wphicks): Detect case where custom kernel would be required AFTER // transfer only auto static constexpr const value = std::conjunction_v< - is_mdspan, - is_mdspan, -#ifndef __CUDACC__ - std::bool_constant, -#endif - std::bool_constant, - std::bool_constant + is_mdspan_v, + std::disjunction_v >; using type = std::enable_if_t; }; @@ -181,12 +189,109 @@ using mdspan_copyable_t = typename mdspan_copyable::t template using mdspan_copyable_v = 
typename mdspan_copyable::value; +#ifdef __CUDACC__ +template +__device__ auto increment_indices(IdxType* indices, IdxType const* max_indices, int rank, int incr = 1) +{ + auto valid_index = true; + auto dim = std::is_same_v ? rank : 0; + do { + indices[dim] += incr; + incr = 0; + while (indices[dim] >= max_indices[dim]) { + indices[dim] -= max_indices[dim]; + ++incr; + } + if constexpr (std::is_same_v) { + --dim; + valid_index = dim >= 0; + } else { + ++dim; + valid_index = dim < rank; + } + } while (incr != 0); + return valid_index; +} + +template +__device__ auto& get_mdspan_elem(MdspanType& md, IdxType const* indices, ResT... resolved_indices) +{ + if constexpr (remaining == IdxType{}) { + return md(resolved_indices...); + } else { + return get_mdspan_elem( + md, indices, indices[remaining - 1], &resolved_indices...); + } +} + +template +__global__ std::enable_if_t< + mdspan_copyable_v::custom_kernel_allowed +> mdspan_device_copy(DstType dst, SrcType src) +{ + using config = mdspan_copyable; + + __shared__ config::dst_value_type tile_buffer[TileDim][TileDim + 1]; + auto tile = mdspan{tile_buffer} + + auto const constexpr tile_elements = TileDim * TileDim; + index_type src_indices[config::dst_rank] = {blockIdx.x * tile_elements}; + index_type dst_indices[config::dst_rank] = {blockIdx.x * tile_elements}; + index_type max_indices[config::dst_rank]; + for (auto i = index_type{}; i < config::dst_rank; ++i) { + max_indices[i] = dst.extent(i); + } + + auto valid_indices = true; + for (auto i = blockIdx.x * tile_elements; i += tile_elements * blockDim.x; i < dst.size()) { + for (auto tile_slow = threadIdx.y; tile_slow += gridDim.y; tile_slow < TileDim) { + for (auto tile_quick = threadIdx.x; tile_quick += gridDim.x; tile_quick < TileDim) { + if (valid_indices) { + if constexpr (std::is_same_v) { + tile(tile_slow, tile_quick) = get_mdspan_elem(src, src_indices); + } else { + tile(tile_quick, tile_slow) = get_mdspan_elem(src, src_indices); + } + } + valid_indices &= + 
increment_indices(src_indices, max_indices, gridDim.x); + } + valid_indices &= + increment_indices(src_indices, max_indices, gridDim.y * TileDim); + } + if constexpr (!std::is_same_v) { + __syncthreads(); + } + for (auto tile_slow = threadIdx.y; tile_slow += gridDim.y; tile_slow < TileDim) { + for (auto tile_quick = threadIdx.x; tile_quick += gridDim.x; tile_quick < TileDim) { + if (valid_indices) { + if constexpr (std::is_same_v) { + get_mdspan_elem(dst, dst_indices) = tile(tile_slow, tile_quick) + } else { + get_mdspan_elem(dst, dst_indices) = tile(tile_quick, tile_slow) + } + } + increment_indices(dst_indices, max_indices, gridDim.x); + } + increment_indices(dst_indices, max_indices, gridDim.y * TileDim); + } + valid_indices &= increment_indices( + src_indices, max_indices, blockDim.x * tile_elements); + increment_indices(dst_indices, max_indices, blockDim.x * tile_elements); + __syncthreads(); + } +} +#endif + template mdspan_copyable_t copy(resources const& res, DstType&& dst, SrcType const& src) { using config = mdspan_copyable; - for (auto i = std::size_t{}; i < SrcType::extents_type::rank(); ++i) { + for (auto i = std::size_t{}; i < config::src_extent_types::rank(); ++i) { RAFT_EXPECTS(src.extents(i) == dst.extents(i), "Must copy between mdspans of the same shape"); } @@ -237,18 +342,26 @@ copy(resources const& res, DstType&& dst, SrcType const& src) dst.data_handle(), dst.stride(0), resource::get_cuda_stream(res))); + } else if constexpr(config::custom_kernel_allowed) { +#ifdef __CUDACC__ + // TODO(wphicks): Determine sensible kernel launch parameters + mdspan_device_copy<<<32, 1024, 0, resource::get_cuda_stream(res)>>>(dst, src); +#else + // Should never actually reach this because of enable_ifs + RAFT_FAIL("raft::copy called in a way that requires custom kernel. 
Please use raft/core/mdspan_copy.cuh and include the header in a .cu file"); +#endif } else if constexpr(config::can_use_std_copy) { std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); } else if constexpr(config::can_use_simd) { } else { - auto indices = std::array::extents_type::rank()>{}; + auto indices = std::array{}; for (auto i = std::size_t{}; i < dst.size(); ++i) { - if constexpr (std::is_same_v::layout_type, layout_c_contiguous>) { + if constexpr (std::is_same_v) { // For layout_right/layout_c_contiguous, we iterate over the // rightmost extent fastest - auto dim = std::remove_reference_t::extents_type::rank(); + auto dim = config::dst_rank; while ((indices[dim]++) == dst.extent(dim)) { - indices[dim] = index_type{}; + indices[dim] = typename config::index_type{}; --dim; } } else { @@ -261,107 +374,13 @@ copy(resources const& res, DstType&& dst, SrcType const& src) // correctness. auto dim = std::size_t{}; while ((indices[dim]++) == dst.extent(dim)) { - indices[dim] = index_type{}; + indices[dim] = typename config::index_type{}; ++dim; } } std::apply(dst, indices) = std::apply(src, indices); } } - - if constexpr (config::can_use_device) { -#ifndef RAFT_DISABLE_CUDA - if constexpr (same_dtype && (same_layout || std::remove_reference_t::extents_type::rank() == 1) && both_contiguous) { - raft::copy( - dst.data_handle(), - src.data_handle(), - dst.size(), - resource::get_cuda_stream(res) - ); - } else if constexpr (same_dtype && both_float_or_double && both_contiguous && - std::remove_reference_t::extents_type::rank() == 2) { - auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; - auto constexpr const beta = typename std::remove_reference_t::value_type{0}; - CUBLAS_TRY(cublasgeam(resource::get_cublas_handle(res), - CUBLAS_OP_T, - CUBLAS_OP_N, - dst.extent(0), - dst.extent(1), - &alpha, - src.data_handle(), - src.stride(0), - &beta, - static_cast::value_type*>(nullptr), - dst.stride(0), - dst.data_handle(), 
- dst.stride(0), - resource::get_cuda_stream(res))); - } else { -#ifdef __CUDACC__ - // TODO(wphicks): Call kernel here -#else - // Ordinarily, we would just make this a .cuh file, but we do not want - // to signal that it *must* be built with CUDA. Instead, if this header - // is used in a way that requires a CUDA compiler, we fail with an - // informative error message. - static_assert( - !mdspan_copy_requires_custom_kernel_v, SrcType>, - "Selected instantiation of raft::copy requires nvcc compilation. Use raft/core/mdspan_copy.cuh instead of raft/core/mdspan_copy.hpp and #include it in a .cu file. The corresponding 'detail' headers should not be included anywhere else directly." - ); -#endif - } -#endif - } else if constexpr (both_host_accessible) { - if constexpr ((same_layout || std::remove_reference_t::extents_type::rank() == 1) && both_contiguous) { - // Use STL if possible; this should be well optimized - std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); - } else { - // TODO (wphicks): Use SIMD for both_contiguous && - // both_float_or_double - - // Finally, copy elements one by one, trying at least to perform - // cache-friendly reads - - auto indices = std::array::extents_type::rank()>{}; - for (auto i = std::size_t{}; i < dst.size(); ++i) { - if constexpr (std::is_same_v::layout_type, layout_c_contiguous>) { - // For layout_right/layout_c_contiguous, we iterate over the - // rightmost extent fastest - auto dim = std::remove_reference_t::extents_type::rank(); - while ((indices[dim]++) == dst.extent(dim)) { - indices[dim] = index_type{}; - --dim; - } - } else { - // For layout_left/layout_f_contiguous (and currently all other - // layouts), we iterate over the leftmost extent fastest - - // TODO(wphicks): Add additional specialization for non-C/F - // arrays that have a stride of 1 in one dimension. This would - // be a performance enhancement; it is not required for - // correctness. 
- auto dim = std::size_t{}; - while ((indices[dim]++) == dst.extent(dim)) { - indices[dim] = index_type{}; - ++dim; - } - } - std::apply(dst, indices) = std::apply(src, indices); - } - } - } else { -#ifndef RAFT_DISABLE_CUDA - if constexpr (same_dtype && same_layout && both_contiguous) { - raft::copy(dst.data_handle(), src.data_handle(), dst.size()); - } else if constexpr (is_device_mdspan_v>) { - // Copy to device memory and call recursively - } else { - // Copy to host memory and call recursively - } -#else - RAFT_FAIL("mdspan copy required device access in non-CUDA build"); -#endif - } } } // namespace detail } // namespace raft diff --git a/cpp/include/raft/core/mdspan_copy.cuh b/cpp/include/raft/core/mdspan_copy.cuh index 93cf853c9c..b9a5c67084 100644 --- a/cpp/include/raft/core/mdspan_copy.cuh +++ b/cpp/include/raft/core/mdspan_copy.cuh @@ -6,15 +6,10 @@ #include #include namespace raft { - template std::enable_if_t< - std::conjunction_v< - std::bool_constant>, - detail::mdspan_copy_requires_custom_kernel, - std::is_convertible, - std::bool_constant - > + detail::mdspan_copyable::custom_kernel_allowed, + detail::mdspan_copyable_t > copy(resources const& res, DstType&& dst, SrcType const& src) { detail::copy(res, dst, src); } diff --git a/cpp/include/raft/core/mdspan_copy.hpp b/cpp/include/raft/core/mdspan_copy.hpp index 166a6ec547..bb28ec1ba2 100644 --- a/cpp/include/raft/core/mdspan_copy.hpp +++ b/cpp/include/raft/core/mdspan_copy.hpp @@ -1,19 +1,12 @@ #pragma once -#include #include -#include -#include #include namespace raft { template std::enable_if_t< - std::conjunction_v< - std::bool_constant, SrcType>>, - std::bool_constant, SrcType>>, - std::is_convertible::element_type>, - std::bool_constant::extents_type::rank() == SrcType::extents_type::rank()> - > + !detail::mdspan_copyable::custom_kernel_allowed, + detail::mdspan_copyable_t > copy(resources const& res, DstType&& dst, SrcType const& src) { detail::copy(res, dst, src); } From 
9a2fa9ea96e857d10af0b8264906f47218085132 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Fri, 1 Sep 2023 13:18:21 -0400 Subject: [PATCH 42/75] Correct build errors --- cpp/include/raft/core/detail/mdspan_copy.hpp | 31 +++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/mdspan_copy.hpp index 1bca59d952..35c66cb9c4 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.hpp +++ b/cpp/include/raft/core/detail/mdspan_copy.hpp @@ -84,13 +84,13 @@ struct mdspan_copyable { auto static constexpr const same_layout = std::is_same_v; auto static constexpr const src_contiguous = std::disjunction_v< - std::is_same_v, - std::is_same_v + std::is_same, + std::is_same >; auto static constexpr const dst_contiguous = std::disjunction_v< - std::is_same_v, - std::is_same_v + std::is_same, + std::is_same >; auto static constexpr const both_contiguous = src_contiguous && dst_contiguous; @@ -117,9 +117,10 @@ struct mdspan_copyable { auto static constexpr const both_host_accessible = dst_host_accessible && src_host_accessible; // Allowed copy codepaths + auto static constexpr const can_use_host = both_host_accessible; #if (defined(__AVX__) || defined(__SSE__) || defined(__ARM_NEON)) - auto static constexpr const can_use_simd = both_host_accessible && both_contiguous; + auto static constexpr const can_use_simd = can_use_host && both_contiguous; # else auto static constexpr const can_use_simd = false; #endif @@ -148,9 +149,14 @@ struct mdspan_copyable { std::bool_constant, std::bool_constant >; - auto static constexpr const can_use_device = std::conjunction_v>; + auto static constexpr const can_use_device = std::conjunction_v< + std::bool_constant, + std::disjunction< + std::bool_constant, + std::bool_constant + > + >; - auto static constexpr const can_use_host = both_host_accessible; auto static constexpr const can_use_cublas = std::conjunction_v< std::bool_constant, 
std::bool_constant, @@ -178,8 +184,11 @@ struct mdspan_copyable { // TODO(wphicks): Detect case where custom kernel would be required AFTER // transfer only auto static constexpr const value = std::conjunction_v< - is_mdspan_v, - std::disjunction_v + std::bool_constant>, + std::disjunction< + std::bool_constant, + std::bool_constant + > >; using type = std::enable_if_t; }; @@ -291,8 +300,8 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, SrcType const& src) { using config = mdspan_copyable; - for (auto i = std::size_t{}; i < config::src_extent_types::rank(); ++i) { - RAFT_EXPECTS(src.extents(i) == dst.extents(i), "Must copy between mdspans of the same shape"); + for (auto i = std::size_t{}; i < config::src_rank; ++i) { + RAFT_EXPECTS(src.extent(i) == dst.extent(i), "Must copy between mdspans of the same shape"); } if constexpr(config::use_intermediate_src) { From eac9de6b3ff5743d81bd175fbe0c0d8a46388f29 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Fri, 1 Sep 2023 14:43:58 -0400 Subject: [PATCH 43/75] Provide passing 3D host transpose tests --- cpp/include/raft/core/detail/mdspan_copy.hpp | 54 +++++++++++------- cpp/test/core/mdspan_copy.cpp | 58 ++++++++++++++++++-- 2 files changed, 86 insertions(+), 26 deletions(-) diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/mdspan_copy.hpp index 35c66cb9c4..18d5a2e98c 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.hpp +++ b/cpp/include/raft/core/detail/mdspan_copy.hpp @@ -15,10 +15,12 @@ */ #pragma once +#include #include #include #include #include +#include #include #include #include @@ -305,6 +307,7 @@ copy(resources const& res, DstType&& dst, SrcType const& src) } if constexpr(config::use_intermediate_src) { + RAFT_LOG_WARN("use_intermediate_src"); // Copy to intermediate source on device, then perform necessary // changes in layout on device, directly into final destination auto intermediate = device_mdarray< @@ -316,6 +319,7 @@ copy(resources const& res, 
DstType&& dst, SrcType const& src) copy(res, dst, intermediate.view()); } else if constexpr(config::use_intermediate_dst) { + RAFT_LOG_WARN("use_intermediate_dst"); // Perform necessary changes in layout on device, then copy to final // destination on host auto intermediate = device_mdarray< @@ -326,6 +330,7 @@ copy(resources const& res, DstType&& dst, SrcType const& src) copy(res, intermediate.view(), src); copy(res, dst, intermediate.view()); } else if constexpr(config::can_use_raft_copy) { + RAFT_LOG_WARN("can_use_raft_copy"); #ifndef RAFT_DISABLE_CUDA raft::copy( dst.data_handle(), @@ -335,6 +340,7 @@ copy(resources const& res, DstType&& dst, SrcType const& src) ); #endif } else if constexpr(config::can_use_cublas) { + RAFT_LOG_WARN("can_use_cublas"); auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; auto constexpr const beta = typename std::remove_reference_t::value_type{0}; CUBLAS_TRY(cublasgeam(resource::get_cublas_handle(res), @@ -352,6 +358,7 @@ copy(resources const& res, DstType&& dst, SrcType const& src) dst.stride(0), resource::get_cuda_stream(res))); } else if constexpr(config::custom_kernel_allowed) { + RAFT_LOG_WARN("custom_kernel_allowed"); #ifdef __CUDACC__ // TODO(wphicks): Determine sensible kernel launch parameters mdspan_device_copy<<<32, 1024, 0, resource::get_cuda_stream(res)>>>(dst, src); @@ -360,31 +367,36 @@ copy(resources const& res, DstType&& dst, SrcType const& src) RAFT_FAIL("raft::copy called in a way that requires custom kernel. 
Please use raft/core/mdspan_copy.cuh and include the header in a .cu file"); #endif } else if constexpr(config::can_use_std_copy) { + RAFT_LOG_WARN("can_use_std_copy"); std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); - } else if constexpr(config::can_use_simd) { + // } else if constexpr(config::can_use_simd) { + // RAFT_LOG_WARN("can_use_simd"); } else { + RAFT_LOG_WARN("Default host copy"); auto indices = std::array{}; for (auto i = std::size_t{}; i < dst.size(); ++i) { - if constexpr (std::is_same_v) { - // For layout_right/layout_c_contiguous, we iterate over the - // rightmost extent fastest - auto dim = config::dst_rank; - while ((indices[dim]++) == dst.extent(dim)) { - indices[dim] = typename config::index_type{}; - --dim; - } - } else { - // For layout_left/layout_f_contiguous (and currently all other - // layouts), we iterate over the leftmost extent fastest - - // TODO(wphicks): Add additional specialization for non-C/F - // arrays that have a stride of 1 in one dimension. This would - // be a performance enhancement; it is not required for - // correctness. - auto dim = std::size_t{}; - while ((indices[dim]++) == dst.extent(dim)) { - indices[dim] = typename config::index_type{}; - ++dim; + if (i != 0) { + if constexpr (std::is_same_v) { + // For layout_right/layout_c_contiguous, we iterate over the + // rightmost extent fastest + auto dim = config::src_rank - 1; + while ((++indices[dim]) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + --dim; + } + } else { + // For layout_left/layout_f_contiguous (and currently all other + // layouts), we iterate over the leftmost extent fastest + + // TODO(wphicks): Add additional specialization for non-C/F + // arrays that have a stride of 1 in one dimension. This would + // be a performance enhancement; it is not required for + // correctness. 
+ auto dim = std::size_t{}; + while ((indices[dim]++) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + ++dim; + } } } std::apply(dst, indices) = std::apply(src, indices); diff --git a/cpp/test/core/mdspan_copy.cpp b/cpp/test/core/mdspan_copy.cpp index 665f8afe75..9ee7850aec 100644 --- a/cpp/test/core/mdspan_copy.cpp +++ b/cpp/test/core/mdspan_copy.cpp @@ -46,7 +46,11 @@ TEST(MDSpanCopy, Mdspan3D) { auto constexpr depth = std::uint32_t{5}; auto constexpr rows = std::uint32_t{3}; auto constexpr cols = std::uint32_t{2}; - auto in = make_host_mdarray( + auto in_left = make_host_mdarray( + res, + extents{} + ); + auto in_right = make_host_mdarray( res, extents{} ); @@ -57,22 +61,66 @@ TEST(MDSpanCopy, Mdspan3D) { for (auto i=std::uint32_t{}; i < depth; ++i) { for (auto j=std::uint32_t{}; j < rows; ++j) { for (auto k=std::uint32_t{}; k < cols; ++k) { - in(i, j, k) = gen_unique_entry(i, j, k); + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); } } } - auto out_different_contiguous_layout = make_host_mdarray( + auto out_left = make_host_mdarray( res, extents{} ); - copy(res, out_different_contiguous_layout.view(), in.view()); + auto out_right = make_host_mdarray( + res, + extents{} + ); + + copy(res, out_right.view(), in_right.view()); + for (auto i=std::uint32_t{}; i < depth; ++i) { + for (auto j=std::uint32_t{}; j < rows; ++j) { + for (auto k=std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), + double(gen_unique_entry(i, j, k)), + CompareApprox{0.0001} + )); + } + } + } + + copy(res, out_right.view(), in_left.view()); + for (auto i=std::uint32_t{}; i < depth; ++i) { + for (auto j=std::uint32_t{}; j < rows; ++j) { + for (auto k=std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), + double(gen_unique_entry(i, j, k)), + CompareApprox{0.0001} + )); + } + } + } + + copy(res, out_left.view(), in_right.view()); + for (auto i=std::uint32_t{}; i < depth; ++i) { + 
for (auto j=std::uint32_t{}; j < rows; ++j) { + for (auto k=std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), + double(gen_unique_entry(i, j, k)), + CompareApprox{0.0001} + )); + } + } + } + copy(res, out_left.view(), in_left.view()); for (auto i=std::uint32_t{}; i < depth; ++i) { for (auto j=std::uint32_t{}; j < rows; ++j) { for (auto k=std::uint32_t{}; k < cols; ++k) { ASSERT_TRUE(match( - out_different_contiguous_layout(i, j, k), + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001} )); From 39cf094de2300b4f70563702fc6740b2bd2ed236 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Fri, 1 Sep 2023 16:37:02 -0400 Subject: [PATCH 44/75] Add working tests for cuBlas based transpose --- cpp/include/raft/core/detail/mdspan_copy.hpp | 350 +++++++++---------- cpp/test/core/mdspan_copy.cpp | 272 ++++++++++---- 2 files changed, 379 insertions(+), 243 deletions(-) diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/mdspan_copy.hpp index 18d5a2e98c..eb83710396 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.hpp +++ b/cpp/include/raft/core/detail/mdspan_copy.hpp @@ -26,20 +26,20 @@ #include #include #ifndef RAFT_DISABLE_CUDA -#include #include +#include #include #include - #ifdef __CUDACC__ +#ifdef __CUDACC__ #include - #endif +#endif #endif namespace raft { namespace detail { -template -struct mdspan_copyable{}; +template +struct mdspan_copyable {}; template struct mdspan_copyable { @@ -56,28 +56,29 @@ struct mdspan_copyable { typename src_extents_type::index_type>; // Dtype properties - using dst_value_type = typename dst_type::value_type; - using src_value_type = typename src_type::value_type; - using dst_element_type = typename dst_type::element_type; - using src_element_type = typename src_type::element_type; + using dst_value_type = typename dst_type::value_type; + using src_value_type = typename src_type::value_type; + using dst_element_type = typename dst_type::element_type; 
+ using src_element_type = typename src_type::element_type; auto static constexpr const same_dtype = std::is_same_v; - auto static constexpr const compatible_dtype = std::is_convertible_v; + auto static constexpr const compatible_dtype = + std::is_convertible_v; - auto static constexpr const dst_float = std::is_same_v; - auto static constexpr const src_float = std::is_same_v; + auto static constexpr const dst_float = std::is_same_v; + auto static constexpr const src_float = std::is_same_v; auto static constexpr const dst_double = std::is_same_v; auto static constexpr const src_double = std::is_same_v; - auto static constexpr const both_float = dst_float && src_float; - auto static constexpr const both_double = dst_double && src_double; + auto static constexpr const both_float = dst_float && src_float; + auto static constexpr const both_double = dst_double && src_double; auto static constexpr const both_float_or_both_double = both_float || both_double; // Ranks - auto static constexpr const dst_rank = dst_extents_type::rank(); - auto static constexpr const src_rank = src_extents_type::rank(); + auto static constexpr const dst_rank = dst_extents_type::rank(); + auto static constexpr const src_rank = src_extents_type::rank(); auto static constexpr const compatible_rank = (dst_rank == src_rank); - auto static constexpr const vector_rank = (dst_rank == 1); - auto static constexpr const matrix_rank = (dst_rank == 2); + auto static constexpr const vector_rank = (dst_rank == 1); + auto static constexpr const matrix_rank = (dst_rank == 2); // Layout properties using dst_layout_type = typename dst_type::layout_type; @@ -85,37 +86,33 @@ struct mdspan_copyable { auto static constexpr const same_layout = std::is_same_v; - auto static constexpr const src_contiguous = std::disjunction_v< - std::is_same, - std::is_same - >; + auto static constexpr const src_contiguous = + std::disjunction_v, + std::is_same>; - auto static constexpr const dst_contiguous = std::disjunction_v< - 
std::is_same, - std::is_same - >; + auto static constexpr const dst_contiguous = + std::disjunction_v, + std::is_same>; auto static constexpr const both_contiguous = src_contiguous && dst_contiguous; - auto static constexpr const same_underlying_layout = std::disjunction_v< - std::bool_constant, - std::bool_constant - >; + auto static constexpr const same_underlying_layout = + std::disjunction_v, + std::bool_constant>; // Layout for intermediate tile if copying through custom kernel - using tile_layout_type = std::conditional_t< - src_contiguous, - src_layout_type, - std::conditional_t - >; - + using tile_layout_type = + std::conditional_t>; // Accessibility auto static constexpr const dst_device_accessible = is_device_mdspan_v; auto static constexpr const src_device_accessible = is_device_mdspan_v; - auto static constexpr const both_device_accessible = dst_device_accessible && src_device_accessible; + auto static constexpr const both_device_accessible = + dst_device_accessible && src_device_accessible; - auto static constexpr const dst_host_accessible = is_host_mdspan_v; - auto static constexpr const src_host_accessible = is_host_mdspan_v; + auto static constexpr const dst_host_accessible = is_host_mdspan_v; + auto static constexpr const src_host_accessible = is_host_mdspan_v; auto static constexpr const both_host_accessible = dst_host_accessible && src_host_accessible; // Allowed copy codepaths @@ -123,86 +120,76 @@ struct mdspan_copyable { #if (defined(__AVX__) || defined(__SSE__) || defined(__ARM_NEON)) auto static constexpr const can_use_simd = can_use_host && both_contiguous; -# else +#else auto static constexpr const can_use_simd = false; #endif - auto static constexpr const can_use_std_copy = std::conjunction_v< - std::bool_constant, - std::bool_constant, - std::bool_constant, - std::bool_constant - >; - auto static constexpr const can_use_raft_copy = std::conjunction_v< - std::bool_constant, - std::bool_constant, - std::bool_constant, - std::bool_constant 
- >; - - auto static constexpr const requires_intermediate = !both_host_accessible && !both_device_accessible && !can_use_raft_copy; - - auto static constexpr const use_intermediate_dst = std::conjunction_v< - std::bool_constant, - std::bool_constant - >; - - auto static constexpr const use_intermediate_src = std::conjunction_v< - std::bool_constant, - std::bool_constant - >; - auto static constexpr const can_use_device = std::conjunction_v< - std::bool_constant, - std::disjunction< - std::bool_constant, - std::bool_constant - > - >; - - auto static constexpr const can_use_cublas = std::conjunction_v< - std::bool_constant, - std::bool_constant, - std::bool_constant, - std::bool_constant, - std::bool_constant, - std::bool_constant - >; - - auto static constexpr const custom_kernel_allowed = std::conjunction_v< - std::bool_constant, - std::bool_constant< - !(can_use_raft_copy || can_use_cublas) - > - >; - - auto static constexpr const custom_kernel_required = std::conjunction_v< - std::bool_constant, - std::bool_constant< - !(can_use_raft_copy || can_use_cublas) - > - >; + auto static constexpr const can_use_std_copy = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + auto static constexpr const can_use_raft_copy = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + + auto static constexpr const requires_intermediate = + !both_host_accessible && !both_device_accessible && !can_use_raft_copy; + + auto static constexpr const use_intermediate_dst = + std::conjunction_v, + std::bool_constant>; + + auto static constexpr const use_intermediate_src = + std::conjunction_v, + std::bool_constant>; + auto static constexpr const can_use_device = + std::conjunction_v, + std::disjunction< + std::bool_constant, + std::bool_constant, + std::bool_constant + > + >; + + auto static constexpr const can_use_cublas = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant, + 
std::bool_constant, + std::bool_constant>; + + auto static constexpr const custom_kernel_allowed = + std::conjunction_v, + std::bool_constant>; + + auto static constexpr const custom_kernel_required = + std::conjunction_v, + std::bool_constant>; // Viable overload? - // TODO(wphicks): Detect case where custom kernel would be required AFTER - // transfer only auto static constexpr const value = std::conjunction_v< - std::bool_constant>, - std::disjunction< - std::bool_constant, - std::bool_constant - > + std::bool_constant>, + std::bool_constant>, + std::bool_constant >; using type = std::enable_if_t; }; -template +template using mdspan_copyable_t = typename mdspan_copyable::type; template using mdspan_copyable_v = typename mdspan_copyable::value; #ifdef __CUDACC__ template -__device__ auto increment_indices(IdxType* indices, IdxType const* max_indices, int rank, int incr = 1) +__device__ auto increment_indices(IdxType* indices, + IdxType const* max_indices, + int rank, + int incr = 1) { auto valid_index = true; auto dim = std::is_same_v ? rank : 0; @@ -239,16 +226,18 @@ __device__ auto& get_mdspan_elem(MdspanType& md, IdxType const* indices, ResT... 
} template -__global__ std::enable_if_t< - mdspan_copyable_v::custom_kernel_allowed -> mdspan_device_copy(DstType dst, SrcType src) +__global__ std::enable_if_t::custom_kernel_allowed> +mdspan_device_copy(DstType dst, SrcType src) { using config = mdspan_copyable; __shared__ config::dst_value_type tile_buffer[TileDim][TileDim + 1]; - auto tile = mdspan{tile_buffer} + auto tile = mdspan < config::dst_value_type, extents + { + tile_buffer + } - auto const constexpr tile_elements = TileDim * TileDim; + auto const constexpr tile_elements = TileDim * TileDim; index_type src_indices[config::dst_rank] = {blockIdx.x * tile_elements}; index_type dst_indices[config::dst_rank] = {blockIdx.x * tile_elements}; index_type max_indices[config::dst_rank]; @@ -298,109 +287,120 @@ __global__ std::enable_if_t< #endif template -mdspan_copyable_t -copy(resources const& res, DstType&& dst, SrcType const& src) +mdspan_copyable_t copy(resources const& res, DstType&& dst, SrcType const& src) { using config = mdspan_copyable; for (auto i = std::size_t{}; i < config::src_rank; ++i) { RAFT_EXPECTS(src.extent(i) == dst.extent(i), "Must copy between mdspans of the same shape"); } - if constexpr(config::use_intermediate_src) { + if constexpr (config::use_intermediate_src) { RAFT_LOG_WARN("use_intermediate_src"); // Copy to intermediate source on device, then perform necessary // changes in layout on device, directly into final destination - auto intermediate = device_mdarray< - typename config::src_value_type, - typename config::src_extents_type, - typename config::src_layout_type - >(res, src.extents()); + auto intermediate = device_mdarray(res, src.extents()); copy(res, intermediate.view(), src); copy(res, dst, intermediate.view()); - } else if constexpr(config::use_intermediate_dst) { + } else if constexpr (config::use_intermediate_dst) { RAFT_LOG_WARN("use_intermediate_dst"); // Perform necessary changes in layout on device, then copy to final // destination on host - auto intermediate = 
device_mdarray< - typename config::dst_value_type, - typename config::dst_extents_type, - typename config::dst_layout_type - >(res, dst.extents()); + auto intermediate = device_mdarray(res, dst.extents()); copy(res, intermediate.view(), src); copy(res, dst, intermediate.view()); - } else if constexpr(config::can_use_raft_copy) { + } else if constexpr (config::can_use_raft_copy) { RAFT_LOG_WARN("can_use_raft_copy"); #ifndef RAFT_DISABLE_CUDA - raft::copy( - dst.data_handle(), - src.data_handle(), - dst.size(), - resource::get_cuda_stream(res) - ); + raft::copy(dst.data_handle(), src.data_handle(), dst.size(), resource::get_cuda_stream(res)); #endif - } else if constexpr(config::can_use_cublas) { + } else if constexpr (config::can_use_cublas) { RAFT_LOG_WARN("can_use_cublas"); auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; auto constexpr const beta = typename std::remove_reference_t::value_type{0}; - CUBLAS_TRY(cublasgeam(resource::get_cublas_handle(res), - CUBLAS_OP_T, - CUBLAS_OP_N, - dst.extent(0), - dst.extent(1), - &alpha, - src.data_handle(), - src.stride(0), - &beta, - static_cast::value_type*>(nullptr), - dst.stride(0), - dst.data_handle(), - dst.stride(0), - resource::get_cuda_stream(res))); - } else if constexpr(config::custom_kernel_allowed) { + if constexpr (std::is_same_v) { + CUBLAS_TRY( + linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(1), + dst.extent(0), + &alpha, + src.data_handle(), + src.extent(0), + &beta, + dst.data_handle(), + dst.extent(1), + dst.data_handle(), + dst.extent(1), + resource::get_cuda_stream(res))); + } else { + CUBLAS_TRY( + linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(0), + dst.extent(1), + &alpha, + src.data_handle(), + src.extent(1), + &beta, + dst.data_handle(), + dst.extent(0), + dst.data_handle(), + dst.extent(0), + resource::get_cuda_stream(res))); + } + } else if constexpr 
(config::custom_kernel_allowed) { RAFT_LOG_WARN("custom_kernel_allowed"); #ifdef __CUDACC__ // TODO(wphicks): Determine sensible kernel launch parameters mdspan_device_copy<<<32, 1024, 0, resource::get_cuda_stream(res)>>>(dst, src); #else // Should never actually reach this because of enable_ifs - RAFT_FAIL("raft::copy called in a way that requires custom kernel. Please use raft/core/mdspan_copy.cuh and include the header in a .cu file"); + RAFT_FAIL( + "raft::copy called in a way that requires custom kernel. Please use " + "raft/core/mdspan_copy.cuh and include the header in a .cu file"); #endif - } else if constexpr(config::can_use_std_copy) { + } else if constexpr (config::can_use_std_copy) { RAFT_LOG_WARN("can_use_std_copy"); std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); - // } else if constexpr(config::can_use_simd) { - // RAFT_LOG_WARN("can_use_simd"); + // } else if constexpr(config::can_use_simd) { + // RAFT_LOG_WARN("can_use_simd"); } else { - RAFT_LOG_WARN("Default host copy"); - auto indices = std::array{}; - for (auto i = std::size_t{}; i < dst.size(); ++i) { - if (i != 0) { - if constexpr (std::is_same_v) { - // For layout_right/layout_c_contiguous, we iterate over the - // rightmost extent fastest - auto dim = config::src_rank - 1; - while ((++indices[dim]) == src.extent(dim)) { - indices[dim] = typename config::index_type{}; - --dim; - } - } else { - // For layout_left/layout_f_contiguous (and currently all other - // layouts), we iterate over the leftmost extent fastest - - // TODO(wphicks): Add additional specialization for non-C/F - // arrays that have a stride of 1 in one dimension. This would - // be a performance enhancement; it is not required for - // correctness. 
- auto dim = std::size_t{}; - while ((indices[dim]++) == src.extent(dim)) { - indices[dim] = typename config::index_type{}; - ++dim; - } + RAFT_LOG_WARN("Default host copy"); + auto indices = std::array{}; + for (auto i = std::size_t{}; i < dst.size(); ++i) { + if (i != 0) { + if constexpr (std::is_same_v) { + // For layout_right/layout_c_contiguous, we iterate over the + // rightmost extent fastest + auto dim = config::src_rank - 1; + while ((++indices[dim]) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + --dim; + } + } else { + // For layout_left/layout_f_contiguous (and currently all other + // layouts), we iterate over the leftmost extent fastest + + // TODO(wphicks): Add additional specialization for non-C/F + // arrays that have a stride of 1 in one dimension. This would + // be a performance enhancement; it is not required for + // correctness. + auto dim = std::size_t{}; + while ((indices[dim]++) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + ++dim; } } - std::apply(dst, indices) = std::apply(src, indices); } + std::apply(dst, indices) = std::apply(src, indices); + } } } } // namespace detail diff --git a/cpp/test/core/mdspan_copy.cpp b/cpp/test/core/mdspan_copy.cpp index 9ee7850aec..bacc1a67f4 100644 --- a/cpp/test/core/mdspan_copy.cpp +++ b/cpp/test/core/mdspan_copy.cpp @@ -14,119 +14,255 @@ * limitations under the License. 
*/ +#include "../test_utils.h" #include #include -#include -#include #include -#include "../test_utils.h" +#include +#include +#include namespace raft { -TEST(MDSpanCopy, Mdspan1D) { - auto res = device_resources{}; +TEST(MDSpanCopy, Mdspan1DHostHost) +{ + auto res = device_resources{}; auto cols = std::uint32_t{2}; - auto in = make_host_vector(res, cols); + auto in_left = make_host_vector(res, cols); - auto gen_unique_entry = [](auto&& x) { - return x; - }; - for (auto i=std::uint32_t{}; i < cols; ++i) { - in(i) = gen_unique_entry(i); + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); } - auto out_different_contiguous_layout = make_host_vector(res, cols); - copy(res, out_different_contiguous_layout.view(), in.view()); - for (auto i=std::uint32_t{}; i < cols; ++i) { - ASSERT_TRUE(match(out_different_contiguous_layout(i), double(gen_unique_entry(i)), CompareApprox{0.0001})); + auto out_right = make_host_vector(res, cols); + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE(match(out_right(i), + double(gen_unique_entry(i)), + CompareApprox{0.0001})); } } -TEST(MDSpanCopy, Mdspan3D) { - auto res = device_resources{}; +TEST(MDSpanCopy, Mdspan1DHostDevice) +{ + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_host_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_device_vector(res, cols); + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE(match(float(out_right(i)), + float(gen_unique_entry(i)), + CompareApprox{0.0001f})); + } +} + +TEST(MDSpanCopy, Mdspan1DDeviceHost) +{ + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_device_vector(res, cols); + + auto 
gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_host_vector(res, cols); + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE(match(float(out_right(i)), + float(gen_unique_entry(i)), + CompareApprox{0.0001f})); + } +} + +TEST(MDSpanCopy, Mdspan3DHostHost) +{ + auto res = device_resources{}; auto constexpr depth = std::uint32_t{5}; - auto constexpr rows = std::uint32_t{3}; - auto constexpr cols = std::uint32_t{2}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols = std::uint32_t{2}; auto in_left = make_host_mdarray( - res, - extents{} - ); + res, extents{}); auto in_right = make_host_mdarray( - res, - extents{} - ); - auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { - return x * 7 + y * 11 + z * 13; - }; - - for (auto i=std::uint32_t{}; i < depth; ++i) { - for (auto j=std::uint32_t{}; j < rows; ++j) { - for (auto k=std::uint32_t{}; k < cols; ++k) { - in_left(i, j, k) = gen_unique_entry(i, j, k); + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); in_right(i, j, k) = gen_unique_entry(i, j, k); } } } auto out_left = make_host_mdarray( - res, - extents{} - ); + res, extents{}); auto out_right = make_host_mdarray( - res, - extents{} - ); + res, extents{}); copy(res, out_right.view(), in_right.view()); - for (auto i=std::uint32_t{}; i < depth; ++i) { - for (auto j=std::uint32_t{}; j < rows; ++j) { - for (auto k=std::uint32_t{}; k < cols; ++k) { + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { ASSERT_TRUE(match( - 
out_right(i, j, k), - double(gen_unique_entry(i, j, k)), - CompareApprox{0.0001} - )); + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); } } } copy(res, out_right.view(), in_left.view()); - for (auto i=std::uint32_t{}; i < depth; ++i) { - for (auto j=std::uint32_t{}; j < rows; ++j) { - for (auto k=std::uint32_t{}; k < cols; ++k) { + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { ASSERT_TRUE(match( - out_right(i, j, k), - double(gen_unique_entry(i, j, k)), - CompareApprox{0.0001} - )); + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); } } } copy(res, out_left.view(), in_right.view()); - for (auto i=std::uint32_t{}; i < depth; ++i) { - for (auto j=std::uint32_t{}; j < rows; ++j) { - for (auto k=std::uint32_t{}; k < cols; ++k) { + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { ASSERT_TRUE(match( - out_left(i, j, k), - double(gen_unique_entry(i, j, k)), - CompareApprox{0.0001} - )); + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); } } } copy(res, out_left.view(), in_left.view()); - for (auto i=std::uint32_t{}; i < depth; ++i) { - for (auto j=std::uint32_t{}; j < rows; ++j) { - for (auto k=std::uint32_t{}; k < cols; ++k) { + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { ASSERT_TRUE(match( - out_left(i, j, k), - double(gen_unique_entry(i, j, k)), - CompareApprox{0.0001} - )); + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } +} + +TEST(MDSpanCopy, Mdspan3DHostDevice) +{ + auto res = device_resources{}; + auto constexpr depth = std::uint32_t{5}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols = std::uint32_t{2}; + auto 
in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); } } } + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + copy(res, out_right.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + float(out_right(i, j, k)), float(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + /* copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } */ + + /* copy(res, out_left.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } */ + + copy(res, out_left.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + float(out_left(i, j, k)), float(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceDevice) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols 
= std::uint32_t{2}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + copy(res, out_right.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + copy(res, out_left.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + float(out_left(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } } -} // namespace raft + +} // namespace raft From 760b6561fed90ef41c9111f8d8f4ca7438b8f77a Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 5 Sep 2023 11:59:36 -0400 Subject: [PATCH 45/75] Add incomplete kernel tests --- cpp/include/raft/core/detail/mdspan_copy.hpp | 17 +-- cpp/include/raft/core/mdspan_copy.cuh | 12 +- cpp/test/CMakeLists.txt | 1 + cpp/test/core/mdspan_copy.cpp | 104 ++++++++++++++- cpp/test/core/mdspan_copy.cu | 131 +++++++++++++++++++ 5 files changed, 241 insertions(+), 24 deletions(-) create mode 100644 cpp/test/core/mdspan_copy.cu diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/mdspan_copy.hpp index 
eb83710396..62a91806d0 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.hpp +++ b/cpp/include/raft/core/detail/mdspan_copy.hpp @@ -30,9 +30,6 @@ #include #include #include -#ifdef __CUDACC__ -#include -#endif #endif namespace raft { @@ -231,17 +228,17 @@ mdspan_device_copy(DstType dst, SrcType src) { using config = mdspan_copyable; - __shared__ config::dst_value_type tile_buffer[TileDim][TileDim + 1]; - auto tile = mdspan < config::dst_value_type, extents + __shared__ typename config::dst_value_type tile_buffer[TileDim][TileDim + 1]; + auto tile = mdspan > { tile_buffer - } + }; auto const constexpr tile_elements = TileDim * TileDim; - index_type src_indices[config::dst_rank] = {blockIdx.x * tile_elements}; - index_type dst_indices[config::dst_rank] = {blockIdx.x * tile_elements}; - index_type max_indices[config::dst_rank]; - for (auto i = index_type{}; i < config::dst_rank; ++i) { + typename config::index_type src_indices[config::dst_rank] = {blockIdx.x * tile_elements}; + typename config::index_type dst_indices[config::dst_rank] = {blockIdx.x * tile_elements}; + typename config::index_type max_indices[config::dst_rank]; + for (auto i = typename config::index_type{}; i < config::dst_rank; ++i) { max_indices[i] = dst.extent(i); } diff --git a/cpp/include/raft/core/mdspan_copy.cuh b/cpp/include/raft/core/mdspan_copy.cuh index b9a5c67084..9889878f94 100644 --- a/cpp/include/raft/core/mdspan_copy.cuh +++ b/cpp/include/raft/core/mdspan_copy.cuh @@ -1,18 +1,12 @@ #pragma once -#include #include -#include -#include -#include -#include +// #include namespace raft { -template +/* template std::enable_if_t< detail::mdspan_copyable::custom_kernel_allowed, detail::mdspan_copyable_t > copy(resources const& res, DstType&& dst, SrcType const& src) { detail::copy(res, dst, src); -} - +} */ } // namespace raft - diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 11c4afae85..0707663536 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ 
-132,6 +132,7 @@ if(BUILD_TESTS) test/core/nvtx.cpp test/core/mdarray.cu test/core/mdspan_copy.cpp + test/core/mdspan_copy.cu test/core/mdspan_utils.cu test/core/numpy_serializer.cu test/core/memory_type.cpp diff --git a/cpp/test/core/mdspan_copy.cpp b/cpp/test/core/mdspan_copy.cpp index bacc1a67f4..a8e60ee848 100644 --- a/cpp/test/core/mdspan_copy.cpp +++ b/cpp/test/core/mdspan_copy.cpp @@ -35,6 +35,7 @@ TEST(MDSpanCopy, Mdspan1DHostHost) } auto out_right = make_host_vector(res, cols); + // std::copy copy(res, out_right.view(), in_left.view()); for (auto i = std::uint32_t{}; i < cols; ++i) { ASSERT_TRUE(match(out_right(i), @@ -54,8 +55,10 @@ TEST(MDSpanCopy, Mdspan1DHostDevice) in_left(i) = gen_unique_entry(i); } + // raft::copy auto out_right = make_device_vector(res, cols); copy(res, out_right.view(), in_left.view()); + res.sync_stream(); for (auto i = std::uint32_t{}; i < cols; ++i) { ASSERT_TRUE(match(float(out_right(i)), float(gen_unique_entry(i)), @@ -74,8 +77,10 @@ TEST(MDSpanCopy, Mdspan1DDeviceHost) in_left(i) = gen_unique_entry(i); } + // raft::copy auto out_right = make_host_vector(res, cols); copy(res, out_right.view(), in_left.view()); + res.sync_stream(); for (auto i = std::uint32_t{}; i < cols; ++i) { ASSERT_TRUE(match(float(out_right(i)), float(gen_unique_entry(i)), @@ -86,9 +91,9 @@ TEST(MDSpanCopy, Mdspan1DDeviceHost) TEST(MDSpanCopy, Mdspan3DHostHost) { auto res = device_resources{}; - auto constexpr depth = std::uint32_t{5}; - auto constexpr rows = std::uint32_t{3}; - auto constexpr cols = std::uint32_t{2}; + auto constexpr depth = std::uint32_t{500}; + auto constexpr rows = std::uint32_t{300}; + auto constexpr cols = std::uint32_t{200}; auto in_left = make_host_mdarray( res, extents{}); auto in_right = make_host_mdarray( @@ -109,6 +114,7 @@ TEST(MDSpanCopy, Mdspan3DHostHost) auto out_right = make_host_mdarray( res, extents{}); + // std::copy copy(res, out_right.view(), in_right.view()); for (auto i = std::uint32_t{}; i < depth; ++i) { for 
(auto j = std::uint32_t{}; j < rows; ++j) { @@ -119,6 +125,7 @@ TEST(MDSpanCopy, Mdspan3DHostHost) } } + // simd or custom logic copy(res, out_right.view(), in_left.view()); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { @@ -129,6 +136,7 @@ TEST(MDSpanCopy, Mdspan3DHostHost) } } + // simd or custom logic copy(res, out_left.view(), in_right.view()); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { @@ -139,6 +147,7 @@ TEST(MDSpanCopy, Mdspan3DHostHost) } } + // std::copy copy(res, out_left.view(), in_left.view()); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { @@ -153,6 +162,8 @@ TEST(MDSpanCopy, Mdspan3DHostHost) TEST(MDSpanCopy, Mdspan3DHostDevice) { auto res = device_resources{}; + // Use smaller values here since host/device copy takes awhile. + // Non-trivial logic is tested in the other cases. auto constexpr depth = std::uint32_t{5}; auto constexpr rows = std::uint32_t{3}; auto constexpr cols = std::uint32_t{2}; @@ -176,7 +187,9 @@ TEST(MDSpanCopy, Mdspan3DHostDevice) auto out_right = make_device_mdarray( res, extents{}); + // raft::copy copy(res, out_right.view(), in_right.view()); + res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { for (auto k = std::uint32_t{}; k < cols; ++k) { @@ -187,6 +200,7 @@ TEST(MDSpanCopy, Mdspan3DHostDevice) } /* copy(res, out_right.view(), in_left.view()); + res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { for (auto k = std::uint32_t{}; k < cols; ++k) { @@ -197,6 +211,7 @@ TEST(MDSpanCopy, Mdspan3DHostDevice) } */ /* copy(res, out_left.view(), in_right.view()); + res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { for (auto k = std::uint32_t{}; k < cols; ++k) { @@ -206,7 +221,9 @@ 
TEST(MDSpanCopy, Mdspan3DHostDevice) } } */ + // raft::copy copy(res, out_left.view(), in_left.view()); + res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { for (auto k = std::uint32_t{}; k < cols; ++k) { @@ -220,8 +237,8 @@ TEST(MDSpanCopy, Mdspan3DHostDevice) TEST(MDSpanCopy, Mdspan2DDeviceDevice) { auto res = device_resources{}; - auto constexpr rows = std::uint32_t{3}; - auto constexpr cols = std::uint32_t{2}; + auto constexpr rows = std::uint32_t{300}; + auto constexpr cols = std::uint32_t{200}; auto in_left = make_device_mdarray( res, extents{}); auto in_right = make_device_mdarray( @@ -240,7 +257,9 @@ TEST(MDSpanCopy, Mdspan2DDeviceDevice) auto out_right = make_device_mdarray( res, extents{}); + // raft::copy copy(res, out_right.view(), in_right.view()); + res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( @@ -248,7 +267,9 @@ TEST(MDSpanCopy, Mdspan2DDeviceDevice) } } + // cublas copy(res, out_right.view(), in_left.view()); + res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( @@ -256,7 +277,9 @@ TEST(MDSpanCopy, Mdspan2DDeviceDevice) } } + // cublas copy(res, out_left.view(), in_right.view()); + res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( @@ -265,4 +288,75 @@ TEST(MDSpanCopy, Mdspan2DDeviceDevice) } } +/* TEST(MDSpanCopy, Mdspan3DDeviceDevice) +{ + auto res = device_resources{}; + auto constexpr depth = std::uint32_t{50}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + for (auto i = 
std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + // Custom kernel + copy(res, out_right.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + // Custom kernel + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + // Custom kernel + copy(res, out_left.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + // Custom kernel + copy(res, out_left.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } +} */ + } // namespace raft diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/test/core/mdspan_copy.cu new file mode 100644 index 0000000000..60b3e00d81 --- /dev/null +++ b/cpp/test/core/mdspan_copy.cu @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +/*TEST(MDSpanCopy, Mdspan3DHostDevice) +{ + auto res = device_resources{}; + // Use smaller values here since host/device copy takes awhile. + // Non-trivial logic is tested in the other cases. + auto constexpr depth = std::uint32_t{5}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols = std::uint32_t{2}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + 
for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } +}*/ + +/* TEST(MDSpanCopy, Mdspan2DDeviceDevice) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{300}; + auto constexpr cols = std::uint32_t{200}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + // raft::copy + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // cublas + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // cublas + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + float(out_left(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} */ + +} // namespace raft From f8d435f2c1d8314adf812f9b76a7466930c97a57 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 5 Sep 2023 12:00:29 -0400 Subject: [PATCH 
46/75] Remove old mdspan copy header --- cpp/include/raft/core/mdspan_copy.cuh | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 cpp/include/raft/core/mdspan_copy.cuh diff --git a/cpp/include/raft/core/mdspan_copy.cuh b/cpp/include/raft/core/mdspan_copy.cuh deleted file mode 100644 index 9889878f94..0000000000 --- a/cpp/include/raft/core/mdspan_copy.cuh +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once -#include -// #include -namespace raft { -/* template -std::enable_if_t< - detail::mdspan_copyable::custom_kernel_allowed, - detail::mdspan_copyable_t -> copy(resources const& res, DstType&& dst, SrcType const& src) { - detail::copy(res, dst, src); -} */ -} // namespace raft From 4c4fbafc110c93426e7ff024bf2be82429c6f2d5 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 5 Sep 2023 12:01:02 -0400 Subject: [PATCH 47/75] Revert "Remove old mdspan copy header" This reverts commit f8d435f2c1d8314adf812f9b76a7466930c97a57. --- cpp/include/raft/core/mdspan_copy.cuh | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 cpp/include/raft/core/mdspan_copy.cuh diff --git a/cpp/include/raft/core/mdspan_copy.cuh b/cpp/include/raft/core/mdspan_copy.cuh new file mode 100644 index 0000000000..9889878f94 --- /dev/null +++ b/cpp/include/raft/core/mdspan_copy.cuh @@ -0,0 +1,12 @@ +#pragma once +#include +// #include +namespace raft { +/* template +std::enable_if_t< + detail::mdspan_copyable::custom_kernel_allowed, + detail::mdspan_copyable_t +> copy(resources const& res, DstType&& dst, SrcType const& src) { + detail::copy(res, dst, src); +} */ +} // namespace raft From ad5c786154ce5bd30acb832af50038f0ba73ea8a Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 5 Sep 2023 12:01:22 -0400 Subject: [PATCH 48/75] Remove correct mdspan copy header --- cpp/include/raft/core/detail/mdspan_copy.cuh | 136 ------------------- 1 file changed, 136 deletions(-) delete mode 100644 cpp/include/raft/core/detail/mdspan_copy.cuh diff --git 
a/cpp/include/raft/core/detail/mdspan_copy.cuh b/cpp/include/raft/core/detail/mdspan_copy.cuh deleted file mode 100644 index e54cc46dc5..0000000000 --- a/cpp/include/raft/core/detail/mdspan_copy.cuh +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include - -namespace raft { -namespace detail { - -template -__device__ auto increment_indices(IdxType* indices, IdxType const* max_indices, int rank, int incr = 1) -{ - auto valid_index = true; - auto dim = std::is_same_v ? rank : 0; - do { - indices[dim] += incr; - incr = 0; - while (indices[dim] >= max_indices[dim]) { - indices[dim] -= max_indices[dim]; - ++incr; - } - if constexpr (std::is_same_v) { - --dim; - valid_index = dim >= 0; - } else { - ++dim; - valid_index = dim < rank; - } - } while (incr != 0); - return valid_index; -} - -template -__device__ auto& get_mdspan_elem(MdspanType& md, IdxType const* indices, ResT... resolved_indices) -{ - if constexpr (remaining == IdxType{}) { - return md(resolved_indices...); - } else { - return get_mdspan_elem( - md, indices, indices[remaining - 1], &resolved_indices...); - } -} - -template -__global__ std::enable_if_t< - std::conjunction_v, - is_device_mdspan_v, - std::is_convertible_v>> -mdspan_device_copy(DstType dst, SrcType src) -{ - // Lay out shmem tile in same layout as source if it is contiguous. 
- // Otherwise, lay it out in same layout as destination if destination is - // contiguous. If neither are contiguous, just fall back to - // layout_right/layout_c_contiguous - using tile_layout_policy = std::conditional_v< - std::disjunction_v, - std::is_same_v>, - SrcType::layout_type, - std::conditional_v< - std::disjunction_v, - std::is_same_v>, - DstType::layout_type, - layout_c_contiguous>>; - __shared__ DstType::value_type tile_buffer[TileDim][TileDim + 1]; - auto tile = mdspan(tile_buffer); - - using index_type = - std::conditional_t<(std::numeric_limits::max() > - std::numeric_limits::max()), - typename DstType::extents::index_type, - typename SrcType::extents::index_type>; - auto const constexpr tile_elements = TileDim * TileDim; - index_type src_indices[DstType::extents::rank()] = {blockIdx.x * tile_elements}; - index_type dst_indices[DstType::extents::rank()] = {blockIdx.x * tile_elements}; - index_type max_indices[DstType::extents::rank()]; - for (auto i = index_type{}; i < DstType::extents::rank(); ++i) { - max_indices[i] = dst.extent(i); - } - - auto valid_indices = true; - for (auto i = blockIdx.x * tile_elements; i += tile_elements * blockDim.x; i < dst.size()) { - for (auto tile_slow = threadIdx.y; tile_slow += gridDim.y; tile_slow < TileDim) { - for (auto tile_quick = threadIdx.x; tile_quick += gridDim.x; tile_quick < TileDim) { - if (valid_indices) { - if constexpr (std::is_same_v) { - tile(tile_slow, tile_quick) = get_mdspan_elem(src, src_indices); - } else { - tile(tile_quick, tile_slow) = get_mdspan_elem(src, src_indices); - } - } - valid_indices &= - increment_indices(src_indices, max_indices, gridDim.x); - } - valid_indices &= - increment_indices(src_indices, max_indices, gridDim.y * TileDim); - } - if constexpr (!std::is_same_v) { - __syncthreads(); - } - for (auto tile_slow = threadIdx.y; tile_slow += gridDim.y; tile_slow < TileDim) { - for (auto tile_quick = threadIdx.x; tile_quick += gridDim.x; tile_quick < TileDim) { - if 
(valid_indices) { - if constexpr (std::is_same_v) { - get_mdspan_elem(dst, dst_indices) = tile(tile_slow, tile_quick) - } else { - get_mdspan_elem(dst, dst_indices) = tile(tile_quick, tile_slow) - } - } - increment_indices(dst_indices, max_indices, gridDim.x); - } - increment_indices(dst_indices, max_indices, gridDim.y * TileDim); - } - valid_indices &= increment_indices( - src_indices, max_indices, blockDim.x * tile_elements); - increment_indices(dst_indices, max_indices, blockDim.x * tile_elements); - __syncthreads(); - } -} - -} // namespace detail -} // namespace raft From 2e433ba4f8257628e5986047c798549d7277e80c Mon Sep 17 00:00:00 2001 From: William Hicks Date: Wed, 6 Sep 2023 19:12:24 -0400 Subject: [PATCH 49/75] Correct std::apply workaround in CUDA --- cpp/include/raft/core/detail/mdspan_copy.hpp | 246 ++++++++++++------- cpp/include/raft/core/mdspan_copy.cuh | 11 +- cpp/include/raft/core/mdspan_copy.hpp | 6 +- cpp/test/core/mdspan_copy.cu | 22 +- 4 files changed, 175 insertions(+), 110 deletions(-) diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/mdspan_copy.hpp index 62a91806d0..11109d8ac3 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.hpp +++ b/cpp/include/raft/core/detail/mdspan_copy.hpp @@ -30,6 +30,9 @@ #include #include #include +#ifdef __CUDACC__ +#include +#endif #endif namespace raft { @@ -74,8 +77,8 @@ struct mdspan_copyable { auto static constexpr const dst_rank = dst_extents_type::rank(); auto static constexpr const src_rank = src_extents_type::rank(); auto static constexpr const compatible_rank = (dst_rank == src_rank); - auto static constexpr const vector_rank = (dst_rank == 1); - auto static constexpr const matrix_rank = (dst_rank == 2); + auto static constexpr const has_vector_rank = (dst_rank == 1); + auto static constexpr const has_matrix_rank = (dst_rank == 2); // Layout properties using dst_layout_type = typename dst_type::layout_type; @@ -95,7 +98,7 @@ struct mdspan_copyable { auto static 
constexpr const same_underlying_layout = std::disjunction_v, - std::bool_constant>; + std::bool_constant>; // Layout for intermediate tile if copying through custom kernel using tile_layout_type = std::conditional_t { std::bool_constant, std::bool_constant, std::bool_constant, - std::bool_constant, + std::bool_constant, std::bool_constant>; auto static constexpr const custom_kernel_allowed = @@ -179,106 +182,165 @@ struct mdspan_copyable { template using mdspan_copyable_t = typename mdspan_copyable::type; template -using mdspan_copyable_v = typename mdspan_copyable::value; +auto static constexpr const mdspan_copyable_v = mdspan_copyable::value; + +template +auto static constexpr const mdspan_copyable_with_kernel_v = mdspan_copyable::custom_kernel_allowed; +template +auto static constexpr const mdspan_uncopyable_with_kernel_v = !mdspan_copyable::custom_kernel_allowed; + + +template +using mdspan_copyable_with_kernel_t = std::enable_if_t, T>; + +template +using mdspan_uncopyable_with_kernel_t = std::enable_if_t, T>; #ifdef __CUDACC__ -template -__device__ auto increment_indices(IdxType* indices, - IdxType const* max_indices, - int rank, - int incr = 1) +auto static constexpr const mdspan_copy_tile_dim = 32; +auto static constexpr const mdspan_copy_tile_elems = mdspan_copy_tile_dim * mdspan_copy_tile_dim; + +// Helper struct to work around lack of CUDA-native std::apply +template +struct index_sequence { +}; + +template +struct make_index_sequence : std::conditional_t< + N == IdxType{}, + index_sequence, + make_index_sequence> {}; + + +/* + * Given an mdspan and an array of indices, return a reference to the + * indicated element. + */ +template +__device__ auto& get_mdspan_elem(MdspanType& md, IdxType const* indices, index_sequence) { - auto valid_index = true; - auto dim = std::is_same_v ? 
rank : 0; - do { - indices[dim] += incr; - incr = 0; - while (indices[dim] >= max_indices[dim]) { - indices[dim] -= max_indices[dim]; - ++incr; - } - if constexpr (std::is_same_v) { - --dim; - valid_index = dim >= 0; - } else { - ++dim; - valid_index = dim < rank; - } - } while (incr != 0); - return valid_index; + return md(indices[Idx]...); +} + +template +__device__ auto& get_mdspan_elem(MdspanType& md, IdxType const* indices) { + return get_mdspan_elem(md, indices, make_index_sequence{}); } +/* Advance old_indices forward by the number of mdspan elements specified + * by increment. Store the result in indices. Return true if the new + * indices are valid for the input mdspan. + */ template -__device__ auto& get_mdspan_elem(MdspanType& md, IdxType const* indices, ResT... resolved_indices) -{ - if constexpr (remaining == IdxType{}) { - return md(resolved_indices...); - } else { - return get_mdspan_elem( - md, indices, indices[remaining - 1], &resolved_indices...); + typename IdxType> +__device__ auto increment_indices( + IdxType* indices, + MdspanType const& md, + IdxType const* old_indices, + IdxType const* index_strides, + IdxType increment +) { + auto constexpr init_dim = std::is_same_v ? IdxType{} :IdxType(MdspanType::rank() - 1); + auto constexpr final_dim = std::is_same_v ? IdxType{} : IdxType(MdspanType::rank() - 1); + + auto valid_index = true; +#pragma unroll + for ( + auto i = init_dim; + i != final_dim; + std::is_same_v ? --i : ++i + ) { + auto cur_index = old_indices[i]; + while (increment >= index_strides[i]) { + increment -= index_strides[i]; + ++cur_index; + } + indices[i] = cur_index; + valid_index &= cur_index < md.extent(i); } + + return valid_index; } -template -__global__ std::enable_if_t::custom_kernel_allowed> +/* + * WARNING: This kernel _must_ be launched with mdspan_copy_tile_dim x + * mdspan_copy_tile_dim threads per block. This restriction allows for + * additional optimizations at the expense of generalized launch + * parameters. 
+ */ +template +__global__ mdspan_copyable_with_kernel_t mdspan_device_copy(DstType dst, SrcType src) { using config = mdspan_copyable; - __shared__ typename config::dst_value_type tile_buffer[TileDim][TileDim + 1]; - auto tile = mdspan > - { - tile_buffer - }; - - auto const constexpr tile_elements = TileDim * TileDim; - typename config::index_type src_indices[config::dst_rank] = {blockIdx.x * tile_elements}; - typename config::index_type dst_indices[config::dst_rank] = {blockIdx.x * tile_elements}; - typename config::index_type max_indices[config::dst_rank]; - for (auto i = typename config::index_type{}; i < config::dst_rank; ++i) { - max_indices[i] = dst.extent(i); + // An intermediate storage location for the data to be copied. + __shared__ typename config::dst_value_type tile[mdspan_copy_tile_dim][mdspan_copy_tile_dim + 1]; + + // Compute the cumulative product of extents in order from fastest to + // slowest varying extent + auto constexpr init_dim_fast = std::is_same_v ? typename config::index_type(config::src_rank - 1) : typename config::index_type{}; + auto constexpr final_dim_fast = std::is_same_v ? typename config::index_type{} : typename config::index_type(config::src_rank - 1); + typename config::index_type index_strides[config::dst_rank]; + auto cur_stride = typename config::index_type{1}; +#pragma unroll + for ( + auto i = init_dim_fast; + i != final_dim_fast; + std::is_same_v ? 
--i : ++i + ) { + index_strides[i] = cur_stride; + cur_stride *= src.extent(i); } - auto valid_indices = true; - for (auto i = blockIdx.x * tile_elements; i += tile_elements * blockDim.x; i < dst.size()) { - for (auto tile_slow = threadIdx.y; tile_slow += gridDim.y; tile_slow < TileDim) { - for (auto tile_quick = threadIdx.x; tile_quick += gridDim.x; tile_quick < TileDim) { - if (valid_indices) { - if constexpr (std::is_same_v) { - tile(tile_slow, tile_quick) = get_mdspan_elem(src, src_indices); - } else { - tile(tile_quick, tile_slow) = get_mdspan_elem(src, src_indices); - } - } - valid_indices &= - increment_indices(src_indices, max_indices, gridDim.x); + // The index of the first element in the mdspan which will be copied via + // the current tile for this block. + typename config::index_type tile_offset[config::dst_rank] = {0}; + typename config::index_type cur_indices[config::dst_rank]; + + while ( + increment_indices( + tile_offset, + src, + tile_offset, + index_strides, + blockIdx.x * mdspan_copy_tile_elems + ) + ) { + auto tile_read_x = std::is_same_v ? threadIdx.x : threadIdx.y; + auto tile_read_y = std::is_same_v ? 
threadIdx.y : threadIdx.x; + + auto valid_index = increment_indices( + cur_indices, + src, + tile_offset, + index_strides, + tile_read_x * mdspan_copy_tile_dim + tile_read_y + ); + + if constexpr (config::same_underlying_layout || !config::dst_contiguous) { + if (valid_index) { + tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); + get_mdspan_elem(dst, cur_indices) = tile[tile_read_x][tile_read_y]; + } + } else { + if (valid_index) { + tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); } - valid_indices &= - increment_indices(src_indices, max_indices, gridDim.y * TileDim); - } - if constexpr (!std::is_same_v) { __syncthreads(); - } - for (auto tile_slow = threadIdx.y; tile_slow += gridDim.y; tile_slow < TileDim) { - for (auto tile_quick = threadIdx.x; tile_quick += gridDim.x; tile_quick < TileDim) { - if (valid_indices) { - if constexpr (std::is_same_v) { - get_mdspan_elem(dst, dst_indices) = tile(tile_slow, tile_quick) - } else { - get_mdspan_elem(dst, dst_indices) = tile(tile_quick, tile_slow) - } - } - increment_indices(dst_indices, max_indices, gridDim.x); + + valid_index = increment_indices( + cur_indices, + src, + tile_offset, + index_strides, + tile_read_y * mdspan_copy_tile_dim + tile_read_x + ); + if (valid_index) { + get_mdspan_elem(dst, static_cast(cur_indices)) = tile[tile_read_y][tile_read_x]; } - increment_indices(dst_indices, max_indices, gridDim.y * TileDim); + __syncthreads(); } - valid_indices &= increment_indices( - src_indices, max_indices, blockDim.x * tile_elements); - increment_indices(dst_indices, max_indices, blockDim.x * tile_elements); - __syncthreads(); } } #endif @@ -355,8 +417,18 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr } else if constexpr (config::custom_kernel_allowed) { RAFT_LOG_WARN("custom_kernel_allowed"); #ifdef __CUDACC__ - // TODO(wphicks): Determine sensible kernel launch parameters - mdspan_device_copy<<<32, 1024, 0, resource::get_cuda_stream(res)>>>(dst, src); + 
auto const blocks = std::min( + // This maximum is somewhat arbitrary. Could query the device to see + // how many blocks we could reasonably allow, but this is probably + // sufficient considering that this kernel will likely overlap with + // real computations for most use cases. + typename config::index_type{32}, + raft::ceildiv( + typename config::index_type(dst.size()), + typename config::index_type(mdspan_copy_tile_elems)) + ); + auto constexpr const threads = dim3{mdspan_copy_tile_dim, mdspan_copy_tile_dim, 1}; + mdspan_device_copy<<>>(dst, src); #else // Should never actually reach this because of enable_ifs RAFT_FAIL( diff --git a/cpp/include/raft/core/mdspan_copy.cuh b/cpp/include/raft/core/mdspan_copy.cuh index 9889878f94..cd92ceaf67 100644 --- a/cpp/include/raft/core/mdspan_copy.cuh +++ b/cpp/include/raft/core/mdspan_copy.cuh @@ -1,12 +1,9 @@ #pragma once #include -// #include namespace raft { -/* template -std::enable_if_t< - detail::mdspan_copyable::custom_kernel_allowed, - detail::mdspan_copyable_t -> copy(resources const& res, DstType&& dst, SrcType const& src) { +template +detail::mdspan_copyable_with_kernel_t +copy(resources const& res, DstType&& dst, SrcType const& src) { detail::copy(res, dst, src); -} */ +} } // namespace raft diff --git a/cpp/include/raft/core/mdspan_copy.hpp b/cpp/include/raft/core/mdspan_copy.hpp index bb28ec1ba2..7792a548db 100644 --- a/cpp/include/raft/core/mdspan_copy.hpp +++ b/cpp/include/raft/core/mdspan_copy.hpp @@ -4,10 +4,8 @@ namespace raft { template -std::enable_if_t< - !detail::mdspan_copyable::custom_kernel_allowed, - detail::mdspan_copyable_t -> copy(resources const& res, DstType&& dst, SrcType const& src) { +detail::mdspan_uncopyable_with_kernel_t +copy(resources const& res, DstType&& dst, SrcType const& src) { detail::copy(res, dst, src); } diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/test/core/mdspan_copy.cu index 60b3e00d81..0e1e0f5860 100644 --- a/cpp/test/core/mdspan_copy.cu +++ 
b/cpp/test/core/mdspan_copy.cu @@ -23,7 +23,7 @@ #include namespace raft { -/*TEST(MDSpanCopy, Mdspan3DHostDevice) +TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) { auto res = device_resources{}; // Use smaller values here since host/device copy takes awhile. @@ -31,9 +31,9 @@ namespace raft { auto constexpr depth = std::uint32_t{5}; auto constexpr rows = std::uint32_t{3}; auto constexpr cols = std::uint32_t{2}; - auto in_left = make_host_mdarray( + auto in_left = make_device_mdarray( res, extents{}); - auto in_right = make_host_mdarray( + auto in_right = make_device_mdarray( res, extents{}); auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; @@ -46,9 +46,9 @@ namespace raft { } } - auto out_left = make_device_mdarray( + auto out_left = make_device_mdarray( res, extents{}); - auto out_right = make_device_mdarray( + auto out_right = make_device_mdarray( res, extents{}); copy(res, out_right.view(), in_left.view()); @@ -56,23 +56,21 @@ namespace raft { for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { for (auto k = std::uint32_t{}; k < cols; ++k) { - ASSERT_TRUE(match( - out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + ASSERT_EQ(out_right(i, j, k), gen_unique_entry(i, j, k)); } } } - copy(res, out_left.view(), in_right.view()); + /* copy(res, out_left.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { for (auto k = std::uint32_t{}; k < cols; ++k) { - ASSERT_TRUE(match( - out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + ASSERT_EQ(out_left(i, j, k), gen_unique_entry(i, j, k)); } } - } -}*/ + } */ +} /* TEST(MDSpanCopy, Mdspan2DDeviceDevice) { From d669e42796ca6f38aff03b9f5d7e72fef85c6a8d Mon Sep 17 00:00:00 2001 From: William Hicks Date: Thu, 7 Sep 2023 18:58:43 -0400 Subject: [PATCH 50/75] Provide fully working copy kernel --- 
cpp/include/raft/core/detail/mdspan_copy.hpp | 232 ++++++++++---- cpp/test/core/mdspan_copy.cu | 302 +++++++++++++++++-- 2 files changed, 459 insertions(+), 75 deletions(-) diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/mdspan_copy.hpp index 11109d8ac3..4988933838 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.hpp +++ b/cpp/include/raft/core/detail/mdspan_copy.hpp @@ -119,7 +119,10 @@ struct mdspan_copyable { auto static constexpr const can_use_host = both_host_accessible; #if (defined(__AVX__) || defined(__SSE__) || defined(__ARM_NEON)) - auto static constexpr const can_use_simd = can_use_host && both_contiguous; + // TODO(wphicks): Following should be only necessary restrictions. Test if + // perf actually improves once fully implemented. + // auto static constexpr const can_use_simd = can_use_host && both_contiguous && both_float_or_both_double; + auto static constexpr const can_use_simd = can_use_host && both_contiguous && both_float && has_matrix_rank; #else auto static constexpr const can_use_simd = false; #endif @@ -211,6 +214,18 @@ struct make_index_sequence : std::conditional_t< index_sequence, make_index_sequence> {}; +/* template +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args, index_sequence) +{ + return lambda(args[Idx]...); +} + +template +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args) +{ + return apply(std::forward(lambda), std::forward(args), make_index_sequence{}); +} */ + /* * Given an mdspan and an array of indices, return a reference to the @@ -240,26 +255,34 @@ __device__ auto increment_indices( IdxType const* index_strides, IdxType increment ) { - auto constexpr init_dim = std::is_same_v ? IdxType{} :IdxType(MdspanType::rank() - 1); - auto constexpr final_dim = std::is_same_v ? 
IdxType{} : IdxType(MdspanType::rank() - 1); +#pragma unroll + for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { + increment += index_strides[i] * old_indices[i]; + } - auto valid_index = true; #pragma unroll - for ( - auto i = init_dim; - i != final_dim; - std::is_same_v ? --i : ++i - ) { - auto cur_index = old_indices[i]; - while (increment >= index_strides[i]) { - increment -= index_strides[i]; + for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { + // Iterate through dimensions in order from slowest to fastest varying + auto const real_index = [](auto ind) { + if constexpr (std::is_same_v) { + return MdspanType::rank() - ind - 1; + } else { + return ind; + } + }(i); + + auto cur_index = IdxType{}; + + // printf("pre-increment: %d %d %d: %d\n", old_indices[0], old_indices[1], old_indices[2], int(increment)); + while (cur_index < md.extent(real_index) - 1 && increment >= index_strides[real_index]) { + increment -= index_strides[real_index]; ++cur_index; } - indices[i] = cur_index; - valid_index &= cur_index < md.extent(i); + indices[real_index] = cur_index; } + // printf("post-increment: %d %d %d: %d\n", old_indices[0], old_indices[1], old_indices[2], int(increment)); - return valid_index; + return increment == IdxType{}; } /* @@ -279,34 +302,84 @@ mdspan_device_copy(DstType dst, SrcType src) // Compute the cumulative product of extents in order from fastest to // slowest varying extent - auto constexpr init_dim_fast = std::is_same_v ? typename config::index_type(config::src_rank - 1) : typename config::index_type{}; - auto constexpr final_dim_fast = std::is_same_v ? typename config::index_type{} : typename config::index_type(config::src_rank - 1); typename config::index_type index_strides[config::dst_rank]; auto cur_stride = typename config::index_type{1}; #pragma unroll - for ( - auto i = init_dim_fast; - i != final_dim_fast; - std::is_same_v ? 
--i : ++i - ) { - index_strides[i] = cur_stride; - cur_stride *= src.extent(i); + for (auto i = typename SrcType::extents_type::rank_type{}; i < config::src_rank; ++i) { + // Iterate through dimensions in order from fastest to slowest varying + auto const real_index = [](auto ind) { + if constexpr (std::is_same_v) { + return config::src_rank - ind - 1; + } else { + return ind; + } + }(i); + + index_strides[real_index] = cur_stride; + cur_stride *= src.extent(real_index); } // The index of the first element in the mdspan which will be copied via // the current tile for this block. typename config::index_type tile_offset[config::dst_rank] = {0}; + /* // 0 0 0 + increment_indices( + tile_offset, + src, + tile_offset, + index_strides, + typename config::index_type{0} + ); + // 1 0 0 + increment_indices( + tile_offset, + src, + tile_offset, + index_strides, + typename config::index_type{1} + ); + // 2 0 0 + increment_indices( + tile_offset, + src, + tile_offset, + index_strides, + typename config::index_type{1} + ); + // 3 0 0 + increment_indices( + tile_offset, + src, + tile_offset, + index_strides, + typename config::index_type{1} + ); + // 4 0 0 + increment_indices( + tile_offset, + src, + tile_offset, + index_strides, + typename config::index_type{1} + ); + // 0 1 0 + increment_indices( + tile_offset, + src, + tile_offset, + index_strides, + typename config::index_type{1} + ); */ typename config::index_type cur_indices[config::dst_rank]; - - while ( - increment_indices( - tile_offset, - src, - tile_offset, - index_strides, - blockIdx.x * mdspan_copy_tile_elems - ) - ) { + auto valid_tile = increment_indices( + tile_offset, + src, + tile_offset, + index_strides, + blockIdx.x * mdspan_copy_tile_elems + ); + + while (valid_tile) { auto tile_read_x = std::is_same_v ? threadIdx.x : threadIdx.y; auto tile_read_y = std::is_same_v ? 
threadIdx.y : threadIdx.x; @@ -325,6 +398,7 @@ mdspan_device_copy(DstType dst, SrcType src) } } else { if (valid_index) { + // printf("read: %d %d %d -> %d %d: %d\n", cur_indices[0], cur_indices[1], cur_indices[2], tile_read_x, tile_read_y, int(get_mdspan_elem(src, cur_indices))); tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); } __syncthreads(); @@ -337,10 +411,19 @@ mdspan_device_copy(DstType dst, SrcType src) tile_read_y * mdspan_copy_tile_dim + tile_read_x ); if (valid_index) { - get_mdspan_elem(dst, static_cast(cur_indices)) = tile[tile_read_y][tile_read_x]; + // printf("write: %d %d -> %d %d %d: %d\n", tile_read_x, tile_read_y, cur_indices[0], cur_indices[1], cur_indices[2], int(tile[tile_read_y][tile_read_x])); + get_mdspan_elem(dst, cur_indices) = tile[tile_read_y][tile_read_x]; + // printf("final: %d %d -> %d %d %d: %d\n", tile_read_x, tile_read_y, cur_indices[0], cur_indices[1], cur_indices[2], int(get_mdspan_elem(dst, cur_indices))); } __syncthreads(); } + valid_tile = increment_indices( + tile_offset, + src, + tile_offset, + index_strides, + blockDim.x * mdspan_copy_tile_elems + ); } } #endif @@ -354,31 +437,41 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr } if constexpr (config::use_intermediate_src) { - RAFT_LOG_WARN("use_intermediate_src"); // Copy to intermediate source on device, then perform necessary // changes in layout on device, directly into final destination - auto intermediate = device_mdarray(res, src.extents()); - copy(res, intermediate.view(), src); - copy(res, dst, intermediate.view()); + using mdarray_t = device_mdarray< + typename config::src_value_type, + typename config::src_extents_type, + typename config::src_layout_type + >; + auto intermediate = mdarray_t( + res, + typename mdarray_t::mapping_type{src.extents()}, + typename mdarray_t::container_policy_type{} + ); + detail::copy(res, intermediate.view(), src); + detail::copy(res, dst, intermediate.view()); } else if constexpr 
(config::use_intermediate_dst) { - RAFT_LOG_WARN("use_intermediate_dst"); // Perform necessary changes in layout on device, then copy to final // destination on host - auto intermediate = device_mdarray(res, dst.extents()); - copy(res, intermediate.view(), src); - copy(res, dst, intermediate.view()); + using mdarray_t = device_mdarray< + typename config::dst_value_type, + typename config::dst_extents_type, + typename config::dst_layout_type + >; + auto intermediate = mdarray_t( + res, + typename mdarray_t::mapping_type{dst.extents()}, + typename mdarray_t::container_policy_type{} + ); + detail::copy(res, intermediate.view(), src); + detail::copy(res, dst, intermediate.view()); } else if constexpr (config::can_use_raft_copy) { - RAFT_LOG_WARN("can_use_raft_copy"); #ifndef RAFT_DISABLE_CUDA raft::copy(dst.data_handle(), src.data_handle(), dst.size(), resource::get_cuda_stream(res)); #endif } else if constexpr (config::can_use_cublas) { - RAFT_LOG_WARN("can_use_cublas"); auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; auto constexpr const beta = typename std::remove_reference_t::value_type{0}; if constexpr (std::is_same_v) { @@ -415,7 +508,6 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr resource::get_cuda_stream(res))); } } else if constexpr (config::custom_kernel_allowed) { - RAFT_LOG_WARN("custom_kernel_allowed"); #ifdef __CUDACC__ auto const blocks = std::min( // This maximum is somewhat arbitrary. 
Could query the device to see @@ -436,12 +528,44 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr "raft/core/mdspan_copy.cuh and include the header in a .cu file"); #endif } else if constexpr (config::can_use_std_copy) { - RAFT_LOG_WARN("can_use_std_copy"); std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); - // } else if constexpr(config::can_use_simd) { - // RAFT_LOG_WARN("can_use_simd"); + } else if constexpr(config::can_use_simd) { + RAFT_LOG_WARN("can_use_simd"); +#ifdef __SSE__ + constexpr auto elem_per_vector = 4; // 4 floats per __m128 + + for (auto i = 0; i < src.extent(0); i += elem_per_vector) { + for (auto j = 0; j < src.extent(1); j += elem_per_vector) { + // Load a row of 4 floats from src into row0 + __m128 row0 = _mm_loadu_ps(&src(i, j)); + // Load the next row of 4 floats from src into row1 + __m128 row1 = _mm_loadu_ps(&src(i + 1, j)); + // Load another row of 4 floats from src into row2 + __m128 row2 = _mm_loadu_ps(&src(i + 2, j)); + // Load the final row of 4 floats from src into row3 + __m128 row3 = _mm_loadu_ps(&src(i + 3, j)); + + // Shuffle elements from row0 and row1. tmp0 holds elements (0,1) from both row0 and row1 + __m128 tmp0 = _mm_shuffle_ps(row0, row1, _MM_SHUFFLE(1, 0, 1, 0)); + // Shuffle elements from row0 and row1. tmp2 holds elements (2,3) from both row0 and row1 + __m128 tmp2 = _mm_shuffle_ps(row0, row1, _MM_SHUFFLE(3, 2, 3, 2)); + // Shuffle elements from row2 and row3. tmp1 holds elements (0,1) from both row2 and row3 + __m128 tmp1 = _mm_shuffle_ps(row2, row3, _MM_SHUFFLE(1, 0, 1, 0)); + // Shuffle elements from row2 and row3. tmp3 holds elements (2,3) from both row2 and row3 + __m128 tmp3 = _mm_shuffle_ps(row2, row3, _MM_SHUFFLE(3, 2, 3, 2)); + + // Final shuffle and store. Shuffle elements from tmp0 and tmp1 into first row of dst. + _mm_storeu_ps(&dst(j, i), _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0))); + // Final shuffle and store. 
Shuffle elements from tmp0 and tmp1 into second row of dst. + _mm_storeu_ps(&dst(j + 1, i), _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1))); + // Final shuffle and store. Shuffle elements from tmp2 and tmp3 into third row of dst. + _mm_storeu_ps(&dst(j + 2, i), _mm_shuffle_ps(tmp2, tmp3, _MM_SHUFFLE(2, 0, 2, 0))); + // Final shuffle and store. Shuffle elements from tmp2 and tmp3 into fourth row of dst. + _mm_storeu_ps(&dst(j + 3, i), _mm_shuffle_ps(tmp2, tmp3, _MM_SHUFFLE(3, 1, 3, 1))); + } + } +#endif } else { - RAFT_LOG_WARN("Default host copy"); auto indices = std::array{}; for (auto i = std::size_t{}; i < dst.size(); ++i) { if (i != 0) { diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/test/core/mdspan_copy.cu index 0e1e0f5860..817067f3d3 100644 --- a/cpp/test/core/mdspan_copy.cu +++ b/cpp/test/core/mdspan_copy.cu @@ -21,22 +21,22 @@ #include #include #include +#include namespace raft { TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) { auto res = device_resources{}; - // Use smaller values here since host/device copy takes awhile. - // Non-trivial logic is tested in the other cases. 
- auto constexpr depth = std::uint32_t{5}; - auto constexpr rows = std::uint32_t{3}; - auto constexpr cols = std::uint32_t{2}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; auto in_left = make_device_mdarray( res, extents{}); auto in_right = make_device_mdarray( res, extents{}); auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { for (auto k = std::uint32_t{}; k < cols; ++k) { @@ -45,7 +45,22 @@ TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) } } } + res.sync_stream(); + // Test dtype conversion without transpose + auto out_long = make_device_mdarray( + res, extents{}); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose auto out_left = make_device_mdarray( res, extents{}); auto out_right = make_device_mdarray( @@ -56,27 +71,27 @@ TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { for (auto k = std::uint32_t{}; k < cols; ++k) { - ASSERT_EQ(out_right(i, j, k), gen_unique_entry(i, j, k)); + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); } } } - /* copy(res, out_left.view(), in_right.view()); + copy(res, out_left.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { for (auto k = std::uint32_t{}; k < cols; ++k) { - ASSERT_EQ(out_left(i, j, k), gen_unique_entry(i, j, k)); + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); } 
} - } */ + } } -/* TEST(MDSpanCopy, Mdspan2DDeviceDevice) +TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda) { auto res = device_resources{}; - auto constexpr rows = std::uint32_t{300}; - auto constexpr cols = std::uint32_t{200}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; auto in_left = make_device_mdarray( res, extents{}); auto in_right = make_device_mdarray( @@ -90,40 +105,285 @@ TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) } } - auto out_left = make_device_mdarray( + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion without transpose + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), + CompareApprox{0.0001})); + } + } + + // Test dtype conversion with transpose + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), + CompareApprox{0.0001})); + } + } + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), + CompareApprox{0.0001})); + } + } +} +TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return 
x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = make_host_mdarray( + res, extents{}); + RAFT_LOG_WARN("BEGIN dtype conversion without transpose"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + RAFT_LOG_WARN("END dtype conversion without transpose"); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + /* // Test transpose + auto out_left = make_host_mdarray( + res, extents{}); + auto out_right = make_host_mdarray( + res, extents{}); + + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } */ +} + +TEST(MDSpanCopy, Mdspan2DDeviceHostCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = 
std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( res, extents{}); - auto out_right = make_device_mdarray( + auto out_right = make_device_mdarray( res, extents{}); - // raft::copy + res.sync_stream(); + + // Test dtype conversion without transpose copy(res, out_right.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( - float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + double(out_right(i, j)), double(gen_unique_entry(i, j)), + CompareApprox{0.0001})); + } + } + + // Test dtype conversion with transpose + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), + CompareApprox{0.0001})); } } + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), + CompareApprox{0.0001})); + } + } +} + +TEST(MDSpanCopy, Mdspan3DHostDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + 
in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = make_device_mdarray( + res, extents{}); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); - // cublas copy(res, out_right.view(), in_left.view()); res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DHostDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + 
+ // Test dtype conversion without transpose + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( - float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + double(out_right(i, j)), double(gen_unique_entry(i, j)), + CompareApprox{0.0001})); } } - // cublas + // Test dtype conversion with transpose + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), + CompareApprox{0.0001})); + } + } copy(res, out_left.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( - float(out_left(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + double(out_left(i, j)), double(gen_unique_entry(i, j)), + CompareApprox{0.0001})); } } -} */ +} + } // namespace raft From ed663c854de819b9b4aa23b16c7a527a8e151fdd Mon Sep 17 00:00:00 2001 From: William Hicks Date: Mon, 11 Sep 2023 13:04:57 -0400 Subject: [PATCH 51/75] Begin adding SIMD support --- cpp/include/raft/core/detail/mdspan_copy.hpp | 123 ++++++++++++++++--- 1 file changed, 108 insertions(+), 15 deletions(-) diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/mdspan_copy.hpp index 4988933838..b3194fcf7b 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.hpp +++ b/cpp/include/raft/core/detail/mdspan_copy.hpp @@ -33,6 +33,9 @@ #ifdef __CUDACC__ #include #endif +#ifdef __SSE__ +#include +#endif #endif namespace raft { @@ -192,13 +195,102 @@ auto static constexpr const mdspan_copyable_with_kernel_v = mdspan_copyable auto static constexpr const mdspan_uncopyable_with_kernel_v = !mdspan_copyable::custom_kernel_allowed; - template using 
mdspan_copyable_with_kernel_t = std::enable_if_t, T>; template using mdspan_uncopyable_with_kernel_t = std::enable_if_t, T>; +template +auto static constexpr const mdspan_copyable_with_simd_v = mdspan_copyable::can_use_simd; +template +using mdspan_copyable_with_simd_t = std::enable_if_t, T>; + +template +struct simd_type_2x2 { + struct type { + type(T val0, T val1, T val2, T val3) : data{val0, val1, val2, val3} {} + private: + std::array data; + }; + auto static load(T const* row0_ptr, T const* row1_ptr) { + return type{row0_ptr[0], row0_ptr[1], row1_ptr[2], row1_ptr[3]}; + } + auto static transpose(type data) { + return type{data[0], data[2], data[1], data[3]}; + } + void static store(type data, T* row0_ptr, T* row1_ptr) { + row0_ptr[0] = data[0]; + row0_ptr[1] = data[1]; + row1_ptr[0] = data[2]; + row1_ptr[1] = data[3]; + } +}; + +#ifdef __SSE__ +template<> +struct simd_type_2x2 { + using type=__m128; + auto static load(float const* row0_ptr, float const* row1_ptr) { + return _mm_set_ps(row1_ptr[1], row1_ptr[0], row0_ptr[1], row0_ptr[0]); + } + auto static transpose(type data) { + return _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 1, 2, 0)); + } + void static store(type data, float* row0_ptr, float* row1_ptr) { + } +}; +#endif + +#ifdef __AVX__ +template<> +struct simd_type_2x2 { + using type=__m256; + auto static load(double const* row0_ptr, double const* row1_ptr) { + __m128d row0 = _mm_loadu_pd(row0_ptr); + __m128d row1 = _mm_loadu_pd(row1_ptr); + return _mm256_set_m128d(row1, row0); + } + auto static transpose(type data) { + return _mm256_permute4x64_pd(data, _MM_SHUFFLE(3, 1, 2, 0)); + } +}; +#endif + +template ::type> +struct simd_matrix_2x2 { + using value_type = std::remove_cv_t; + + simd_matrix_2x2(T const* row0_ptr, T const* row1_ptr) : data{simd_type::load(row0_ptr, row1_ptr)} {} + + auto transpose() { + return simd_type:: + } + auto store(value_type* row0_ptr, value_type* row1_ptr) { + _mm_storeu_ps(row0_ptr, row0); + _mm_storeu_ps(row1_ptr, row1); + } 
+ auto transpose(T* row0_ptr, T* row1_ptr) { + transpose().store(row0_ptr, row1_ptr); + } + + private: + simd_type data; +}; + +template +struct simd_matrix_mxn { + std::vector rows; + IdxT row_length; +}; + +template +mdspan_copyable_with_simd_t mdspan_host_copy(DstType&& dst, SrcType const& src) { + using config = mdspan_copyable; +} + + + #ifdef __CUDACC__ auto static constexpr const mdspan_copy_tile_dim = 32; auto static constexpr const mdspan_copy_tile_elems = mdspan_copy_tile_dim * mdspan_copy_tile_dim; @@ -533,33 +625,34 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr RAFT_LOG_WARN("can_use_simd"); #ifdef __SSE__ constexpr auto elem_per_vector = 4; // 4 floats per __m128 - - for (auto i = 0; i < src.extent(0); i += elem_per_vector) { - for (auto j = 0; j < src.extent(1); j += elem_per_vector) { + auto i = 0; + for (; i < src.extent(0); i += elem_per_vector) { + auto j = 0; + for (; j < src.extent(1); j += elem_per_vector) { // Load a row of 4 floats from src into row0 - __m128 row0 = _mm_loadu_ps(&src(i, j)); + auto row0 = _mm_loadu_ps(&src(i, j)); // Load the next row of 4 floats from src into row1 - __m128 row1 = _mm_loadu_ps(&src(i + 1, j)); + auto row1 = _mm_loadu_ps(&src(i + 1, j)); // Load another row of 4 floats from src into row2 - __m128 row2 = _mm_loadu_ps(&src(i + 2, j)); + auto row2 = _mm_loadu_ps(&src(i + 2, j)); // Load the final row of 4 floats from src into row3 - __m128 row3 = _mm_loadu_ps(&src(i + 3, j)); + auto row3 = _mm_loadu_ps(&src(i + 3, j)); // Shuffle elements from row0 and row1. tmp0 holds elements (0,1) from both row0 and row1 - __m128 tmp0 = _mm_shuffle_ps(row0, row1, _MM_SHUFFLE(1, 0, 1, 0)); + auto tmp0 = _mm_shuffle_ps(row0, row1, _MM_SHUFFLE(1, 0, 1, 0)); // Shuffle elements from row0 and row1. 
tmp2 holds elements (2,3) from both row0 and row1 - __m128 tmp2 = _mm_shuffle_ps(row0, row1, _MM_SHUFFLE(3, 2, 3, 2)); + auto tmp2 = _mm_shuffle_ps(row0, row1, _MM_SHUFFLE(3, 2, 3, 2)); // Shuffle elements from row2 and row3. tmp1 holds elements (0,1) from both row2 and row3 - __m128 tmp1 = _mm_shuffle_ps(row2, row3, _MM_SHUFFLE(1, 0, 1, 0)); + auto tmp1 = _mm_shuffle_ps(row2, row3, _MM_SHUFFLE(1, 0, 1, 0)); // Shuffle elements from row2 and row3. tmp3 holds elements (2,3) from both row2 and row3 - __m128 tmp3 = _mm_shuffle_ps(row2, row3, _MM_SHUFFLE(3, 2, 3, 2)); + auto tmp3 = _mm_shuffle_ps(row2, row3, _MM_SHUFFLE(3, 2, 3, 2)); // Final shuffle and store. Shuffle elements from tmp0 and tmp1 into first row of dst. - _mm_storeu_ps(&dst(j, i), _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0))); + _mm_storeu_ps(&dst(i, j), _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0))); // Final shuffle and store. Shuffle elements from tmp0 and tmp1 into second row of dst. - _mm_storeu_ps(&dst(j + 1, i), _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1))); + _mm_storeu_ps(&dst(i + 1, j), _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1))); // Final shuffle and store. Shuffle elements from tmp2 and tmp3 into third row of dst. - _mm_storeu_ps(&dst(j + 2, i), _mm_shuffle_ps(tmp2, tmp3, _MM_SHUFFLE(2, 0, 2, 0))); + _mm_storeu_ps(&dst(i + 2, i), _mm_shuffle_ps(tmp2, tmp3, _MM_SHUFFLE(2, 0, 2, 0))); // Final shuffle and store. Shuffle elements from tmp2 and tmp3 into fourth row of dst. _mm_storeu_ps(&dst(j + 3, i), _mm_shuffle_ps(tmp2, tmp3, _MM_SHUFFLE(3, 1, 3, 1))); } From ab809e8c834049416de1a88ddf2d0c5e3476519e Mon Sep 17 00:00:00 2001 From: William Hicks Date: Mon, 11 Sep 2023 13:38:56 -0400 Subject: [PATCH 52/75] Revert "Begin adding SIMD support" This reverts commit ed663c854de819b9b4aa23b16c7a527a8e151fdd. 
--- cpp/include/raft/core/detail/mdspan_copy.hpp | 123 +++---------------- 1 file changed, 15 insertions(+), 108 deletions(-) diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/mdspan_copy.hpp index b3194fcf7b..4988933838 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.hpp +++ b/cpp/include/raft/core/detail/mdspan_copy.hpp @@ -33,9 +33,6 @@ #ifdef __CUDACC__ #include #endif -#ifdef __SSE__ -#include -#endif #endif namespace raft { @@ -195,102 +192,13 @@ auto static constexpr const mdspan_copyable_with_kernel_v = mdspan_copyable auto static constexpr const mdspan_uncopyable_with_kernel_v = !mdspan_copyable::custom_kernel_allowed; + template using mdspan_copyable_with_kernel_t = std::enable_if_t, T>; template using mdspan_uncopyable_with_kernel_t = std::enable_if_t, T>; -template -auto static constexpr const mdspan_copyable_with_simd_v = mdspan_copyable::can_use_simd; -template -using mdspan_copyable_with_simd_t = std::enable_if_t, T>; - -template -struct simd_type_2x2 { - struct type { - type(T val0, T val1, T val2, T val3) : data{val0, val1, val2, val3} {} - private: - std::array data; - }; - auto static load(T const* row0_ptr, T const* row1_ptr) { - return type{row0_ptr[0], row0_ptr[1], row1_ptr[2], row1_ptr[3]}; - } - auto static transpose(type data) { - return type{data[0], data[2], data[1], data[3]}; - } - void static store(type data, T* row0_ptr, T* row1_ptr) { - row0_ptr[0] = data[0]; - row0_ptr[1] = data[1]; - row1_ptr[0] = data[2]; - row1_ptr[1] = data[3]; - } -}; - -#ifdef __SSE__ -template<> -struct simd_type_2x2 { - using type=__m128; - auto static load(float const* row0_ptr, float const* row1_ptr) { - return _mm_set_ps(row1_ptr[1], row1_ptr[0], row0_ptr[1], row0_ptr[0]); - } - auto static transpose(type data) { - return _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 1, 2, 0)); - } - void static store(type data, float* row0_ptr, float* row1_ptr) { - } -}; -#endif - -#ifdef __AVX__ -template<> -struct simd_type_2x2 { 
- using type=__m256; - auto static load(double const* row0_ptr, double const* row1_ptr) { - __m128d row0 = _mm_loadu_pd(row0_ptr); - __m128d row1 = _mm_loadu_pd(row1_ptr); - return _mm256_set_m128d(row1, row0); - } - auto static transpose(type data) { - return _mm256_permute4x64_pd(data, _MM_SHUFFLE(3, 1, 2, 0)); - } -}; -#endif - -template ::type> -struct simd_matrix_2x2 { - using value_type = std::remove_cv_t; - - simd_matrix_2x2(T const* row0_ptr, T const* row1_ptr) : data{simd_type::load(row0_ptr, row1_ptr)} {} - - auto transpose() { - return simd_type:: - } - auto store(value_type* row0_ptr, value_type* row1_ptr) { - _mm_storeu_ps(row0_ptr, row0); - _mm_storeu_ps(row1_ptr, row1); - } - auto transpose(T* row0_ptr, T* row1_ptr) { - transpose().store(row0_ptr, row1_ptr); - } - - private: - simd_type data; -}; - -template -struct simd_matrix_mxn { - std::vector rows; - IdxT row_length; -}; - -template -mdspan_copyable_with_simd_t mdspan_host_copy(DstType&& dst, SrcType const& src) { - using config = mdspan_copyable; -} - - - #ifdef __CUDACC__ auto static constexpr const mdspan_copy_tile_dim = 32; auto static constexpr const mdspan_copy_tile_elems = mdspan_copy_tile_dim * mdspan_copy_tile_dim; @@ -625,34 +533,33 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr RAFT_LOG_WARN("can_use_simd"); #ifdef __SSE__ constexpr auto elem_per_vector = 4; // 4 floats per __m128 - auto i = 0; - for (; i < src.extent(0); i += elem_per_vector) { - auto j = 0; - for (; j < src.extent(1); j += elem_per_vector) { + + for (auto i = 0; i < src.extent(0); i += elem_per_vector) { + for (auto j = 0; j < src.extent(1); j += elem_per_vector) { // Load a row of 4 floats from src into row0 - auto row0 = _mm_loadu_ps(&src(i, j)); + __m128 row0 = _mm_loadu_ps(&src(i, j)); // Load the next row of 4 floats from src into row1 - auto row1 = _mm_loadu_ps(&src(i + 1, j)); + __m128 row1 = _mm_loadu_ps(&src(i + 1, j)); // Load another row of 4 floats from src into row2 - auto row2 = 
_mm_loadu_ps(&src(i + 2, j)); + __m128 row2 = _mm_loadu_ps(&src(i + 2, j)); // Load the final row of 4 floats from src into row3 - auto row3 = _mm_loadu_ps(&src(i + 3, j)); + __m128 row3 = _mm_loadu_ps(&src(i + 3, j)); // Shuffle elements from row0 and row1. tmp0 holds elements (0,1) from both row0 and row1 - auto tmp0 = _mm_shuffle_ps(row0, row1, _MM_SHUFFLE(1, 0, 1, 0)); + __m128 tmp0 = _mm_shuffle_ps(row0, row1, _MM_SHUFFLE(1, 0, 1, 0)); // Shuffle elements from row0 and row1. tmp2 holds elements (2,3) from both row0 and row1 - auto tmp2 = _mm_shuffle_ps(row0, row1, _MM_SHUFFLE(3, 2, 3, 2)); + __m128 tmp2 = _mm_shuffle_ps(row0, row1, _MM_SHUFFLE(3, 2, 3, 2)); // Shuffle elements from row2 and row3. tmp1 holds elements (0,1) from both row2 and row3 - auto tmp1 = _mm_shuffle_ps(row2, row3, _MM_SHUFFLE(1, 0, 1, 0)); + __m128 tmp1 = _mm_shuffle_ps(row2, row3, _MM_SHUFFLE(1, 0, 1, 0)); // Shuffle elements from row2 and row3. tmp3 holds elements (2,3) from both row2 and row3 - auto tmp3 = _mm_shuffle_ps(row2, row3, _MM_SHUFFLE(3, 2, 3, 2)); + __m128 tmp3 = _mm_shuffle_ps(row2, row3, _MM_SHUFFLE(3, 2, 3, 2)); // Final shuffle and store. Shuffle elements from tmp0 and tmp1 into first row of dst. - _mm_storeu_ps(&dst(i, j), _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0))); + _mm_storeu_ps(&dst(j, i), _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0))); // Final shuffle and store. Shuffle elements from tmp0 and tmp1 into second row of dst. - _mm_storeu_ps(&dst(i + 1, j), _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1))); + _mm_storeu_ps(&dst(j + 1, i), _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1))); // Final shuffle and store. Shuffle elements from tmp2 and tmp3 into third row of dst. - _mm_storeu_ps(&dst(i + 2, i), _mm_shuffle_ps(tmp2, tmp3, _MM_SHUFFLE(2, 0, 2, 0))); + _mm_storeu_ps(&dst(j + 2, i), _mm_shuffle_ps(tmp2, tmp3, _MM_SHUFFLE(2, 0, 2, 0))); // Final shuffle and store. Shuffle elements from tmp2 and tmp3 into fourth row of dst. 
_mm_storeu_ps(&dst(j + 3, i), _mm_shuffle_ps(tmp2, tmp3, _MM_SHUFFLE(3, 1, 3, 1))); } From 49d871a0aac38732b1abb4c0c0e43a8cf01b9528 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Mon, 11 Sep 2023 14:26:48 -0400 Subject: [PATCH 53/75] Disable initial SIMD implementation --- cpp/include/raft/core/detail/mdspan_copy.hpp | 57 +++++--------------- cpp/include/raft/core/mdspan_copy.cuh | 9 ++++ cpp/include/raft/core/mdspan_copy.hpp | 4 +- cpp/test/core/mdspan_copy.cu | 4 +- 4 files changed, 26 insertions(+), 48 deletions(-) diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/mdspan_copy.hpp index 4988933838..2b53610727 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.hpp +++ b/cpp/include/raft/core/detail/mdspan_copy.hpp @@ -41,6 +41,10 @@ namespace detail { template struct mdspan_copyable {}; +/* + * A helper struct used to determine whether one mdspan type can be copied to + * another and if so how + */ template struct mdspan_copyable { using dst_type = std::remove_reference_t; @@ -293,7 +297,7 @@ __device__ auto increment_indices( */ template __global__ mdspan_copyable_with_kernel_t -mdspan_device_copy(DstType dst, SrcType src) +mdspan_copy_kernel(DstType dst, SrcType src) { using config = mdspan_copyable; @@ -520,52 +524,18 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr typename config::index_type(mdspan_copy_tile_elems)) ); auto constexpr const threads = dim3{mdspan_copy_tile_dim, mdspan_copy_tile_dim, 1}; - mdspan_device_copy<<>>(dst, src); + mdspan_copy_kernel<<>>(dst, src); #else - // Should never actually reach this because of enable_ifs + // Should never actually reach this because of enable_ifs. Included for + // safety. RAFT_FAIL( "raft::copy called in a way that requires custom kernel. 
Please use " "raft/core/mdspan_copy.cuh and include the header in a .cu file"); #endif } else if constexpr (config::can_use_std_copy) { std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); - } else if constexpr(config::can_use_simd) { - RAFT_LOG_WARN("can_use_simd"); -#ifdef __SSE__ - constexpr auto elem_per_vector = 4; // 4 floats per __m128 - - for (auto i = 0; i < src.extent(0); i += elem_per_vector) { - for (auto j = 0; j < src.extent(1); j += elem_per_vector) { - // Load a row of 4 floats from src into row0 - __m128 row0 = _mm_loadu_ps(&src(i, j)); - // Load the next row of 4 floats from src into row1 - __m128 row1 = _mm_loadu_ps(&src(i + 1, j)); - // Load another row of 4 floats from src into row2 - __m128 row2 = _mm_loadu_ps(&src(i + 2, j)); - // Load the final row of 4 floats from src into row3 - __m128 row3 = _mm_loadu_ps(&src(i + 3, j)); - - // Shuffle elements from row0 and row1. tmp0 holds elements (0,1) from both row0 and row1 - __m128 tmp0 = _mm_shuffle_ps(row0, row1, _MM_SHUFFLE(1, 0, 1, 0)); - // Shuffle elements from row0 and row1. tmp2 holds elements (2,3) from both row0 and row1 - __m128 tmp2 = _mm_shuffle_ps(row0, row1, _MM_SHUFFLE(3, 2, 3, 2)); - // Shuffle elements from row2 and row3. tmp1 holds elements (0,1) from both row2 and row3 - __m128 tmp1 = _mm_shuffle_ps(row2, row3, _MM_SHUFFLE(1, 0, 1, 0)); - // Shuffle elements from row2 and row3. tmp3 holds elements (2,3) from both row2 and row3 - __m128 tmp3 = _mm_shuffle_ps(row2, row3, _MM_SHUFFLE(3, 2, 3, 2)); - - // Final shuffle and store. Shuffle elements from tmp0 and tmp1 into first row of dst. - _mm_storeu_ps(&dst(j, i), _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0))); - // Final shuffle and store. Shuffle elements from tmp0 and tmp1 into second row of dst. - _mm_storeu_ps(&dst(j + 1, i), _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1))); - // Final shuffle and store. Shuffle elements from tmp2 and tmp3 into third row of dst. 
- _mm_storeu_ps(&dst(j + 2, i), _mm_shuffle_ps(tmp2, tmp3, _MM_SHUFFLE(2, 0, 2, 0))); - // Final shuffle and store. Shuffle elements from tmp2 and tmp3 into fourth row of dst. - _mm_storeu_ps(&dst(j + 3, i), _mm_shuffle_ps(tmp2, tmp3, _MM_SHUFFLE(3, 1, 3, 1))); - } - } -#endif } else { + // TODO(wphicks): Make the following cache-oblivious and add SIMD support auto indices = std::array{}; for (auto i = std::size_t{}; i < dst.size(); ++i) { if (i != 0) { @@ -579,12 +549,9 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr } } else { // For layout_left/layout_f_contiguous (and currently all other - // layouts), we iterate over the leftmost extent fastest - - // TODO(wphicks): Add additional specialization for non-C/F - // arrays that have a stride of 1 in one dimension. This would - // be a performance enhancement; it is not required for - // correctness. + // layouts), we iterate over the leftmost extent fastest. The + // cache-oblivious implementation should work through dimensions in + // order of increasing stride. 
auto dim = std::size_t{}; while ((indices[dim]++) == src.extent(dim)) { indices[dim] = typename config::index_type{}; diff --git a/cpp/include/raft/core/mdspan_copy.cuh b/cpp/include/raft/core/mdspan_copy.cuh index cd92ceaf67..9a5446a631 100644 --- a/cpp/include/raft/core/mdspan_copy.cuh +++ b/cpp/include/raft/core/mdspan_copy.cuh @@ -6,4 +6,13 @@ detail::mdspan_copyable_with_kernel_t copy(resources const& res, DstType&& dst, SrcType const& src) { detail::copy(res, dst, src); } + +#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED +#define RAFT_NON_CUDA_COPY_IMPLEMENTED +template +detail::mdspan_uncopyable_with_kernel_t +copy(resources const& res, DstType&& dst, SrcType const& src) { + detail::copy(res, dst, src); +} +#endif } // namespace raft diff --git a/cpp/include/raft/core/mdspan_copy.hpp b/cpp/include/raft/core/mdspan_copy.hpp index 7792a548db..58fca40bd5 100644 --- a/cpp/include/raft/core/mdspan_copy.hpp +++ b/cpp/include/raft/core/mdspan_copy.hpp @@ -1,12 +1,14 @@ #pragma once #include -#include namespace raft { +#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED +#define RAFT_NON_CUDA_COPY_IMPLEMENTED template detail::mdspan_uncopyable_with_kernel_t copy(resources const& res, DstType&& dst, SrcType const& src) { detail::copy(res, dst, src); } +#endif } // namespace raft diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/test/core/mdspan_copy.cu index 817067f3d3..f5c44da97f 100644 --- a/cpp/test/core/mdspan_copy.cu +++ b/cpp/test/core/mdspan_copy.cu @@ -181,7 +181,7 @@ TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) } } - /* // Test transpose + // Test transpose auto out_left = make_host_mdarray( res, extents{}); auto out_right = make_host_mdarray( @@ -205,7 +205,7 @@ TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); } } - } */ + } } TEST(MDSpanCopy, Mdspan2DDeviceHostCuda) From cb24abc02a2b4ce370a9a24602d2c148e7ced376 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Mon, 11 Sep 2023 18:27:35 -0400 Subject: [PATCH 54/75] Rename 
mdspan copy headers --- cpp/include/raft/core/copy.cuh | 22 + cpp/include/raft/core/copy.hpp | 16 + .../core/detail/{mdspan_copy.hpp => copy.hpp} | 265 +++++---- cpp/include/raft/core/mdbuffer.hpp | 511 ++++++++---------- cpp/include/raft/core/mdspan_copy.cuh | 18 - cpp/include/raft/core/mdspan_copy.hpp | 14 - cpp/test/core/mdspan_copy.cpp | 81 ++- cpp/test/core/mdspan_copy.cu | 77 ++- 8 files changed, 455 insertions(+), 549 deletions(-) create mode 100644 cpp/include/raft/core/copy.cuh create mode 100644 cpp/include/raft/core/copy.hpp rename cpp/include/raft/core/detail/{mdspan_copy.hpp => copy.hpp} (70%) delete mode 100644 cpp/include/raft/core/mdspan_copy.cuh delete mode 100644 cpp/include/raft/core/mdspan_copy.hpp diff --git a/cpp/include/raft/core/copy.cuh b/cpp/include/raft/core/copy.cuh new file mode 100644 index 0000000000..f3b25f8a45 --- /dev/null +++ b/cpp/include/raft/core/copy.cuh @@ -0,0 +1,22 @@ +#pragma once +#include +namespace raft { +template +detail::mdspan_copyable_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType const& src) +{ + detail::copy(res, dst, src); +} + +#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED +#define RAFT_NON_CUDA_COPY_IMPLEMENTED +template +detail::mdspan_uncopyable_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType const& src) +{ + detail::copy(res, dst, src); +} +#endif +} // namespace raft diff --git a/cpp/include/raft/core/copy.hpp b/cpp/include/raft/core/copy.hpp new file mode 100644 index 0000000000..f8854b3374 --- /dev/null +++ b/cpp/include/raft/core/copy.hpp @@ -0,0 +1,16 @@ +#pragma once +#include +namespace raft { + +#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED +#define RAFT_NON_CUDA_COPY_IMPLEMENTED +template +detail::mdspan_uncopyable_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType const& src) +{ + detail::copy(res, dst, src); +} +#endif + +} // namespace raft diff --git a/cpp/include/raft/core/detail/mdspan_copy.hpp b/cpp/include/raft/core/detail/copy.hpp similarity 
index 70% rename from cpp/include/raft/core/detail/mdspan_copy.hpp rename to cpp/include/raft/core/detail/copy.hpp index 2b53610727..3c820a005e 100644 --- a/cpp/include/raft/core/detail/mdspan_copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -81,8 +81,8 @@ struct mdspan_copyable { auto static constexpr const dst_rank = dst_extents_type::rank(); auto static constexpr const src_rank = src_extents_type::rank(); auto static constexpr const compatible_rank = (dst_rank == src_rank); - auto static constexpr const has_vector_rank = (dst_rank == 1); - auto static constexpr const has_matrix_rank = (dst_rank == 2); + auto static constexpr const has_vector_rank = (dst_rank == 1); + auto static constexpr const has_matrix_rank = (dst_rank == 2); // Layout properties using dst_layout_type = typename dst_type::layout_type; @@ -125,8 +125,10 @@ struct mdspan_copyable { #if (defined(__AVX__) || defined(__SSE__) || defined(__ARM_NEON)) // TODO(wphicks): Following should be only necessary restrictions. Test if // perf actually improves once fully implemented. 
- // auto static constexpr const can_use_simd = can_use_host && both_contiguous && both_float_or_both_double; - auto static constexpr const can_use_simd = can_use_host && both_contiguous && both_float && has_matrix_rank; + // auto static constexpr const can_use_simd = can_use_host && both_contiguous && + // both_float_or_both_double; + auto static constexpr const can_use_simd = + can_use_host && both_contiguous && both_float && has_matrix_rank; #else auto static constexpr const can_use_simd = false; #endif @@ -154,12 +156,9 @@ struct mdspan_copyable { std::bool_constant>; auto static constexpr const can_use_device = std::conjunction_v, - std::disjunction< - std::bool_constant, - std::bool_constant, - std::bool_constant - > - >; + std::disjunction, + std::bool_constant, + std::bool_constant>>; auto static constexpr const can_use_cublas = std::conjunction_v, @@ -178,48 +177,51 @@ struct mdspan_copyable { std::bool_constant>; // Viable overload? - auto static constexpr const value = std::conjunction_v< - std::bool_constant>, - std::bool_constant>, - std::bool_constant - >; + auto static constexpr const value = + std::conjunction_v>, + std::bool_constant>, + std::bool_constant>; using type = std::enable_if_t; }; template using mdspan_copyable_t = typename mdspan_copyable::type; template -auto static constexpr const mdspan_copyable_v = mdspan_copyable::value; +auto static constexpr const mdspan_copyable_v = + mdspan_copyable::value; template -auto static constexpr const mdspan_copyable_with_kernel_v = mdspan_copyable::custom_kernel_allowed; +auto static constexpr const mdspan_copyable_with_kernel_v = + mdspan_copyable::custom_kernel_allowed; template -auto static constexpr const mdspan_uncopyable_with_kernel_v = !mdspan_copyable::custom_kernel_allowed; +auto static constexpr const mdspan_uncopyable_with_kernel_v = + !mdspan_copyable::custom_kernel_allowed; +template +using mdspan_copyable_with_kernel_t = + std::enable_if_t, T>; -template -using 
mdspan_copyable_with_kernel_t = std::enable_if_t, T>; - -template -using mdspan_uncopyable_with_kernel_t = std::enable_if_t, T>; +template +using mdspan_uncopyable_with_kernel_t = + std::enable_if_t, T>; #ifdef __CUDACC__ -auto static constexpr const mdspan_copy_tile_dim = 32; +auto static constexpr const mdspan_copy_tile_dim = 32; auto static constexpr const mdspan_copy_tile_elems = mdspan_copy_tile_dim * mdspan_copy_tile_dim; // Helper struct to work around lack of CUDA-native std::apply -template -struct index_sequence { -}; +template +struct index_sequence {}; -template -struct make_index_sequence : std::conditional_t< - N == IdxType{}, - index_sequence, - make_index_sequence> {}; +template +struct make_index_sequence + : std::conditional_t, + make_index_sequence> {}; /* template -__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args, index_sequence) +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args, index_sequence) { return lambda(args[Idx]...); } @@ -227,38 +229,40 @@ __host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args, in template __host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args) { - return apply(std::forward(lambda), std::forward(args), make_index_sequence{}); + return apply(std::forward(lambda), std::forward(args), +make_index_sequence{}); } */ - /* * Given an mdspan and an array of indices, return a reference to the * indicated element. 
*/ template -__device__ auto& get_mdspan_elem(MdspanType& md, IdxType const* indices, index_sequence) +__device__ auto& get_mdspan_elem(MdspanType& md, + IdxType const* indices, + index_sequence) { return md(indices[Idx]...); } template -__device__ auto& get_mdspan_elem(MdspanType& md, IdxType const* indices) { - return get_mdspan_elem(md, indices, make_index_sequence{}); +__device__ auto& get_mdspan_elem(MdspanType& md, IdxType const* indices) +{ + return get_mdspan_elem( + md, indices, make_index_sequence{}); } /* Advance old_indices forward by the number of mdspan elements specified * by increment. Store the result in indices. Return true if the new * indices are valid for the input mdspan. */ -template -__device__ auto increment_indices( - IdxType* indices, - MdspanType const& md, - IdxType const* old_indices, - IdxType const* index_strides, - IdxType increment -) { +template +__device__ auto increment_indices(IdxType* indices, + MdspanType const& md, + IdxType const* old_indices, + IdxType const* index_strides, + IdxType increment) +{ #pragma unroll for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { increment += index_strides[i] * old_indices[i]; @@ -277,14 +281,16 @@ __device__ auto increment_indices( auto cur_index = IdxType{}; - // printf("pre-increment: %d %d %d: %d\n", old_indices[0], old_indices[1], old_indices[2], int(increment)); + // printf("pre-increment: %d %d %d: %d\n", old_indices[0], old_indices[1], old_indices[2], + // int(increment)); while (cur_index < md.extent(real_index) - 1 && increment >= index_strides[real_index]) { increment -= index_strides[real_index]; ++cur_index; } indices[real_index] = cur_index; } - // printf("post-increment: %d %d %d: %d\n", old_indices[0], old_indices[1], old_indices[2], int(increment)); + // printf("post-increment: %d %d %d: %d\n", old_indices[0], old_indices[1], old_indices[2], + // int(increment)); return increment == IdxType{}; } @@ -296,8 +302,8 @@ __device__ auto 
increment_indices( * parameters. */ template -__global__ mdspan_copyable_with_kernel_t -mdspan_copy_kernel(DstType dst, SrcType src) +__global__ mdspan_copyable_with_kernel_t mdspan_copy_kernel(DstType dst, + SrcType src) { using config = mdspan_copyable; @@ -376,58 +382,51 @@ mdspan_copy_kernel(DstType dst, SrcType src) ); */ typename config::index_type cur_indices[config::dst_rank]; auto valid_tile = increment_indices( - tile_offset, - src, - tile_offset, - index_strides, - blockIdx.x * mdspan_copy_tile_elems - ); + tile_offset, src, tile_offset, index_strides, blockIdx.x * mdspan_copy_tile_elems); while (valid_tile) { - auto tile_read_x = std::is_same_v ? threadIdx.x : threadIdx.y; - auto tile_read_y = std::is_same_v ? threadIdx.y : threadIdx.x; - - auto valid_index = increment_indices( - cur_indices, - src, - tile_offset, - index_strides, - tile_read_x * mdspan_copy_tile_dim + tile_read_y - ); + auto tile_read_x = std::is_same_v + ? threadIdx.x + : threadIdx.y; + auto tile_read_y = std::is_same_v + ? 
threadIdx.y + : threadIdx.x; + + auto valid_index = increment_indices(cur_indices, + src, + tile_offset, + index_strides, + tile_read_x * mdspan_copy_tile_dim + tile_read_y); if constexpr (config::same_underlying_layout || !config::dst_contiguous) { if (valid_index) { - tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); + tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); get_mdspan_elem(dst, cur_indices) = tile[tile_read_x][tile_read_y]; } } else { if (valid_index) { - // printf("read: %d %d %d -> %d %d: %d\n", cur_indices[0], cur_indices[1], cur_indices[2], tile_read_x, tile_read_y, int(get_mdspan_elem(src, cur_indices))); + // printf("read: %d %d %d -> %d %d: %d\n", cur_indices[0], cur_indices[1], cur_indices[2], + // tile_read_x, tile_read_y, int(get_mdspan_elem(src, cur_indices))); tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); } __syncthreads(); - valid_index = increment_indices( - cur_indices, - src, - tile_offset, - index_strides, - tile_read_y * mdspan_copy_tile_dim + tile_read_x - ); + valid_index = increment_indices(cur_indices, + src, + tile_offset, + index_strides, + tile_read_y * mdspan_copy_tile_dim + tile_read_x); if (valid_index) { - // printf("write: %d %d -> %d %d %d: %d\n", tile_read_x, tile_read_y, cur_indices[0], cur_indices[1], cur_indices[2], int(tile[tile_read_y][tile_read_x])); + // printf("write: %d %d -> %d %d %d: %d\n", tile_read_x, tile_read_y, cur_indices[0], + // cur_indices[1], cur_indices[2], int(tile[tile_read_y][tile_read_x])); get_mdspan_elem(dst, cur_indices) = tile[tile_read_y][tile_read_x]; - // printf("final: %d %d -> %d %d %d: %d\n", tile_read_x, tile_read_y, cur_indices[0], cur_indices[1], cur_indices[2], int(get_mdspan_elem(dst, cur_indices))); + // printf("final: %d %d -> %d %d %d: %d\n", tile_read_x, tile_read_y, cur_indices[0], + // cur_indices[1], cur_indices[2], int(get_mdspan_elem(dst, cur_indices))); } __syncthreads(); } valid_tile = increment_indices( - 
tile_offset, - src, - tile_offset, - index_strides, - blockDim.x * mdspan_copy_tile_elems - ); + tile_offset, src, tile_offset, index_strides, blockDim.x * mdspan_copy_tile_elems); } } #endif @@ -443,32 +442,24 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr if constexpr (config::use_intermediate_src) { // Copy to intermediate source on device, then perform necessary // changes in layout on device, directly into final destination - using mdarray_t = device_mdarray< - typename config::src_value_type, - typename config::src_extents_type, - typename config::src_layout_type - >; - auto intermediate = mdarray_t( - res, - typename mdarray_t::mapping_type{src.extents()}, - typename mdarray_t::container_policy_type{} - ); + using mdarray_t = device_mdarray; + auto intermediate = mdarray_t(res, + typename mdarray_t::mapping_type{src.extents()}, + typename mdarray_t::container_policy_type{}); detail::copy(res, intermediate.view(), src); detail::copy(res, dst, intermediate.view()); } else if constexpr (config::use_intermediate_dst) { // Perform necessary changes in layout on device, then copy to final // destination on host - using mdarray_t = device_mdarray< - typename config::dst_value_type, - typename config::dst_extents_type, - typename config::dst_layout_type - >; - auto intermediate = mdarray_t( - res, - typename mdarray_t::mapping_type{dst.extents()}, - typename mdarray_t::container_policy_type{} - ); + using mdarray_t = device_mdarray; + auto intermediate = mdarray_t(res, + typename mdarray_t::mapping_type{dst.extents()}, + typename mdarray_t::container_policy_type{}); detail::copy(res, intermediate.view(), src); detail::copy(res, dst, intermediate.view()); } else if constexpr (config::can_use_raft_copy) { @@ -479,37 +470,35 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; auto constexpr const beta = typename std::remove_reference_t::value_type{0}; if 
constexpr (std::is_same_v) { - CUBLAS_TRY( - linalg::detail::cublasgeam(resource::get_cublas_handle(res), - CUBLAS_OP_T, - CUBLAS_OP_N, - dst.extent(1), - dst.extent(0), - &alpha, - src.data_handle(), - src.extent(0), - &beta, - dst.data_handle(), - dst.extent(1), - dst.data_handle(), - dst.extent(1), - resource::get_cuda_stream(res))); + CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(1), + dst.extent(0), + &alpha, + src.data_handle(), + src.extent(0), + &beta, + dst.data_handle(), + dst.extent(1), + dst.data_handle(), + dst.extent(1), + resource::get_cuda_stream(res))); } else { - CUBLAS_TRY( - linalg::detail::cublasgeam(resource::get_cublas_handle(res), - CUBLAS_OP_T, - CUBLAS_OP_N, - dst.extent(0), - dst.extent(1), - &alpha, - src.data_handle(), - src.extent(1), - &beta, - dst.data_handle(), - dst.extent(0), - dst.data_handle(), - dst.extent(0), - resource::get_cuda_stream(res))); + CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(0), + dst.extent(1), + &alpha, + src.data_handle(), + src.extent(1), + &beta, + dst.data_handle(), + dst.extent(0), + dst.data_handle(), + dst.extent(0), + resource::get_cuda_stream(res))); } } else if constexpr (config::custom_kernel_allowed) { #ifdef __CUDACC__ @@ -519,10 +508,8 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr // sufficient considering that this kernel will likely overlap with // real computations for most use cases. 
typename config::index_type{32}, - raft::ceildiv( - typename config::index_type(dst.size()), - typename config::index_type(mdspan_copy_tile_elems)) - ); + raft::ceildiv(typename config::index_type(dst.size()), + typename config::index_type(mdspan_copy_tile_elems))); auto constexpr const threads = dim3{mdspan_copy_tile_dim, mdspan_copy_tile_dim, 1}; mdspan_copy_kernel<<>>(dst, src); #else @@ -530,7 +517,7 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr // safety. RAFT_FAIL( "raft::copy called in a way that requires custom kernel. Please use " - "raft/core/mdspan_copy.cuh and include the header in a .cu file"); + "raft/core/copy.cuh and include the header in a .cu file"); #endif } else if constexpr (config::can_use_std_copy) { std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp index a73e5b1249..844a8a2c45 100644 --- a/cpp/include/raft/core/mdbuffer.hpp +++ b/cpp/include/raft/core/mdbuffer.hpp @@ -17,8 +17,6 @@ #include #include #include -#include -#include #include #include #include @@ -29,6 +27,8 @@ #include #include #include +#include +#include #ifndef RAFT_DISABLE_CUDA #include #include @@ -36,138 +36,42 @@ namespace raft { -inline auto constexpr variant_index_from_memory_type(raft::memory_type mem_type) { +inline auto constexpr variant_index_from_memory_type(raft::memory_type mem_type) +{ return static_cast>(mem_type); } template -using alternate_from_mem_type = std::variant_alternative_t; - -namespace detail { - -template < - typename DstElementType, - typename DstExtents, - typename DstLayoutPolicy, - typename DstAccessorPolicy, - typename SrcElementType, - typename SrcExtents, - typename SrcLayoutPolicy, - typename SrcAccessorPolicy, - typename ExecutionPolicy, - std::enable_if_t, - SrcExtents::rank() == DstExtents::rank() - >>* = nullptr -> -void copy( - resources const& res, - mdspan & dst, - mdspan const& src, - 
ExecutionPolicy host_exec_policy = std::execution::unseq -) { - // TODO(Check size match?) - if constexpr ( - // Contiguous memory, no transpose required - std::conjunction_v< - std::is_same_v, - std::disjunction_v< - std::is_same_v, - std::is_same_v - > - > - ) { - if constexpr ( - std::disjunction_v< - std::conjunction_v< - CUDA_ENABLED, - ! DstAccessorPolicy::mem_type::is_device_accessible, - ! SrcAccessorPolicy::mem_type::is_device_accessible - >, - std::conjunction_v< - ! CUDA_ENABLED, - DstAccessorPolicy::mem_type::is_host_accessible, - SrcAccessorPolicy::mem_type::is_host_accessible - >, - > - ) { - std::copy( - host_exec_policy, - src.data_handle(), - src.data_handle() + src.size(), - dst.data_handle() - ); - } else { -#ifndef RAFT_DISABLE_CUDA - if constexpr(std::is_same_v)) { - raft::copy( - dst.data_handle(), - src.data_handle(), - src.size(), - get_stream_view(res) - ); - } else { - // TODO(wphicks): Convert type on src device and then copy - } -#else - throw non_cuda_build_error{ - "Attempted copy to/from device in non-CUDA build" - }; -#endif - } - } else { // Non-contiguous memory or transpose required - if constexpr ( - std::conjunction_v< - DstAccessorPolicy::mem_type::is_device_accessible, - SrcAccessorPolicy::mem_type::is_device_accessible - > - ) { - // TODO(wphicks): Conversion/transpose kernel - } else if constexpr ( - std::conjunction_v< - DstAccessorPolicy::mem_type::is_host_accessible, - SrcAccessorPolicy::mem_type::is_host_accessible - > - ) { - // TODO(wphicks): CPU conversion - } else { - // TODO(wphicks): Copy to intermediate mdarray on dest device, then call - // recursively for transpose/conversion - } - } -} -} // namespace detail - +using alternate_from_mem_type = + std::variant_alternative_t; template -using default_container_policy_variant = std::variant< - host_vector_policy, - device_uvector_policy, - managed_uvector_policy, - pinned_vector_policy ->; - -template > +using default_container_policy_variant = std::variant, + 
device_uvector_policy, + managed_uvector_policy, + pinned_vector_policy>; + +template > struct universal_buffer_reference { using value_type = typename std::remove_cv_t; - using pointer = value_type*; + using pointer = value_type*; using const_pointer = value_type const*; - universal_buffer_reference(pointer ptr, memory_type mem_type, stream_view stream=stream_view_per_thread) + universal_buffer_reference(pointer ptr, + memory_type mem_type, + stream_view stream = stream_view_per_thread) : ptr_{ptr}, mem_type_{mem_type}, stream_{stream} { } #ifndef RAFT_DISABLE_CUDA explicit universal_buffer_reference(thrust::device_ptr ptr, - memory_type mem_type=memory_type::device, - stream_view stream=stream_view_per_thread) + memory_type mem_type = memory_type::device, + stream_view stream = stream_view_per_thread) : universal_buffer_reference{ptr.get(), mem_type, stream} { - RAFT_EXPECTS( - is_device_accessible(mem_type), - "Attempted to create host-only reference from Thrust device pointer" - ); + RAFT_EXPECTS(is_device_accessible(mem_type), + "Attempted to create host-only reference from Thrust device pointer"); } #endif @@ -178,11 +82,9 @@ struct universal_buffer_reference { result = *ptr_; } else { #ifdef RAFT_DISABLE_CUDA - throw non_cuda_build_error{ - "Attempted to access device reference in non-CUDA build" - }; + throw non_cuda_build_error{"Attempted to access device reference in non-CUDA build"}; #else - update_host(&result, ptr_, 1, stream_); + update_host(&result, ptr_, 1, stream_); #endif } return result; @@ -194,9 +96,7 @@ struct universal_buffer_reference { *ptr_ = other; } else { #ifdef RAFT_DISABLE_CUDA - throw non_cuda_build_error{ - "Attempted to assign to device reference in non-CUDA build" - }; + throw non_cuda_build_error{"Attempted to assign to device reference in non-CUDA build"}; #else update_device(ptr_, &other, 1, stream_); #endif @@ -210,163 +110,152 @@ struct universal_buffer_reference { raft::stream_view stream_; }; -template < - typename 
ElementType, - typename ContainerPolicyVariant=default_container_policy_variant -> +template > struct default_buffer_container_policy { using element_type = ElementType; - using value_type = std::remove_cv_t; + using value_type = std::remove_cv_t; using reference = universal_buffer_reference; using const_reference = universal_buffer_reference; - using pointer = element_type*; - using const_pointer = element_type const*; + using pointer = element_type*; + using const_pointer = element_type const*; using container_policy_variant = ContainerPolicyVariant; template - using container_policy = host_device_accessor, MemType>; + using container_policy = + host_device_accessor, MemType>; private: template using container_policy_at_index = std::variant_alternative_t; public: - using container_type_variant = std::variant< - typename container_policy_at_index<0>::container_type, - typename container_policy_at_index<1>::container_type, - typename container_policy_at_index<2>::container_type, - typename container_policy_at_index<3>::container_type - >; + using container_type_variant = + std::variant::container_type, + typename container_policy_at_index<1>::container_type, + typename container_policy_at_index<2>::container_type, + typename container_policy_at_index<3>::container_type>; template using container_type = alternate_from_mem_type; - using accessor_policy_variant = std::variant< - typename container_policy_at_index<0>::accessor_policy, - typename container_policy_at_index<1>::accessor_policy, - typename container_policy_at_index<2>::accessor_policy, - typename container_policy_at_index<3>::accessor_policy - >; + using accessor_policy_variant = + std::variant::accessor_policy, + typename container_policy_at_index<1>::accessor_policy, + typename container_policy_at_index<2>::accessor_policy, + typename container_policy_at_index<3>::accessor_policy>; template using accessor_policy = alternate_from_mem_type; - using const_accessor_policy_variant = std::variant< - typename 
container_policy_at_index<0>::const_accessor_policy, - typename container_policy_at_index<1>::const_accessor_policy, - typename container_policy_at_index<2>::const_accessor_policy, - typename container_policy_at_index<3>::const_accessor_policy - >; + using const_accessor_policy_variant = + std::variant::const_accessor_policy, + typename container_policy_at_index<1>::const_accessor_policy, + typename container_policy_at_index<2>::const_accessor_policy, + typename container_policy_at_index<3>::const_accessor_policy>; template using const_accessor_policy = alternate_from_mem_type; template - auto create(raft::resources const& res, size_t n) { + auto create(raft::resources const& res, size_t n) + { return container_type(res, n); } - auto create(raft::resources const& res, size_t n, raft::memory_type mem_type) { + auto create(raft::resources const& res, size_t n, raft::memory_type mem_type) + { auto result = container_type_variant{}; - switch(mem_type) { - case raft::memory_type::host: - result = create(res, n); - break; - case raft::memory_type::device: - result = create(res, n); - break; - case raft::memory_type::managed: - result = create(res, n); - break; - case raft::memory_type::pinned: - result = create(res, n); - break; + switch (mem_type) { + case raft::memory_type::host: result = create(res, n); break; + case raft::memory_type::device: result = create(res, n); break; + case raft::memory_type::managed: result = create(res, n); break; + case raft::memory_type::pinned: result = create(res, n); break; } return result; } private: template - auto static constexpr has_stream() -> decltype(std::declval().stream(), bool()) { + auto static constexpr has_stream() -> decltype(std::declval().stream(), bool()) + { return true; }; - auto static constexpr has_stream(...) -> bool { - return false; - }; + auto static constexpr has_stream(...) 
-> bool { return false; }; public: - template >()>* = nullptr> - [[nodiscard]] auto constexpr access(container_type& c, std::size_t n) const noexcept { + template >()>* = nullptr> + [[nodiscard]] auto constexpr access(container_type& c, std::size_t n) const noexcept + { return reference{c.data() + n, MemType, c.stream()}; } - template >()>* = nullptr> - [[nodiscard]] auto constexpr access(container_type& c, std::size_t n) const noexcept { + template >()>* = nullptr> + [[nodiscard]] auto constexpr access(container_type& c, std::size_t n) const noexcept + { return reference{c.data() + n, MemType}; } - template >()>* = nullptr> - [[nodiscard]] auto constexpr access(container_type const& c, std::size_t n) const noexcept { + template >()>* = nullptr> + [[nodiscard]] auto constexpr access(container_type const& c, + std::size_t n) const noexcept + { return const_reference{c.data() + n, MemType, c.stream()}; } - template >()>* = nullptr> - [[nodiscard]] auto constexpr access(container_type const& c, std::size_t n) const noexcept { + template >()>* = nullptr> + [[nodiscard]] auto constexpr access(container_type const& c, + std::size_t n) const noexcept + { return const_reference{c.data() + n, MemType}; } - template - [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } - template - [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } + template + [[nodiscard]] auto make_accessor_policy() noexcept + { + return accessor_policy{}; + } + template + [[nodiscard]] auto make_accessor_policy() const noexcept + { + return const_accessor_policy{}; + } - [[nodiscard]] auto make_accessor_policy(memory_type mem_type) noexcept { + [[nodiscard]] auto make_accessor_policy(memory_type mem_type) noexcept + { auto result = accessor_policy_variant{}; - switch(mem_type) { - case memory_type::host: - result = make_accessor_policy(); - break; - case memory_type::device: - result = make_accessor_policy(); - break; - case 
memory_type::managed: - result = make_accessor_policy(); - break; - case memory_type::pinned: - result = make_accessor_policy(); - break; + switch (mem_type) { + case memory_type::host: result = make_accessor_policy(); break; + case memory_type::device: result = make_accessor_policy(); break; + case memory_type::managed: result = make_accessor_policy(); break; + case memory_type::pinned: result = make_accessor_policy(); break; } return result; -} - [[nodiscard]] auto make_accessor_policy(memory_type mem_type) const noexcept { + } + [[nodiscard]] auto make_accessor_policy(memory_type mem_type) const noexcept + { auto result = const_accessor_policy_variant{}; - switch(mem_type) { - case memory_type::host: - result = make_accessor_policy(); - break; - case memory_type::device: - result = make_accessor_policy(); - break; - case memory_type::managed: - result = make_accessor_policy(); - break; - case memory_type::pinned: - result = make_accessor_policy(); - break; + switch (mem_type) { + case memory_type::host: result = make_accessor_policy(); break; + case memory_type::device: result = make_accessor_policy(); break; + case memory_type::managed: result = make_accessor_policy(); break; + case memory_type::pinned: result = make_accessor_policy(); break; } return result; -} - + } }; -template < - typename ElementType, - typename Extents, - typename LayoutPolicy = layout_c_contiguous, - typename ContainerPolicy = default_buffer_container_policy -> struct mdbuffer { +template > +struct mdbuffer { using extents_type = Extents; using layout_type = LayoutPolicy; using mapping_type = typename layout_type::template mapping; @@ -377,88 +266,118 @@ template < using difference_type = std::ptrdiff_t; using rank_type = typename extents_type::rank_type; - using container_policy_type = ContainerPolicy; + using container_policy_type = ContainerPolicy; + using accessor_policy_variant = typename ContainerPolicy::accessor_policy_variant; + + template + using accessor_policy = 
alternate_from_mem_type; using container_type_variant = typename container_policy_type::container_type_variant; template using container_type = typename container_policy_type::template container_type; - using pointer = typename container_policy_type::pointer; - using const_pointer = typename container_policy_type::const_pointer; - using reference = typename container_policy_type::reference; + using pointer = typename container_policy_type::pointer; + using const_pointer = typename container_policy_type::const_pointer; + using reference = typename container_policy_type::reference; using const_reference = typename container_policy_type::const_reference; template - using owning_type = mdarray< - element_type, - extents_type, - layout_type, - typename container_policy_type::template container_policy - >; - using owning_type_variant = std::variant< - owning_type(0)>, - owning_type(1)>, - owning_type(2)>, - owning_type(3)> - >; + using owning_type = mdarray>; + using owning_type_variant = std::variant(0)>, + owning_type(1)>, + owning_type(2)>, + owning_type(3)>>; template using view_type = typename owning_type::view_type; - using view_type_variant = std::variant< - view_type(0)>, - view_type(1)>, - view_type(2)>, - view_type(3)> - >; + using view_type_variant = std::variant(0)>, + view_type(1)>, + view_type(2)>, + view_type(3)>>; template - using const_view_type = typename owning_type::const_view_type; - using const_view_type_variant = std::variant< - const_view_type(0)>, - const_view_type(1)>, - const_view_type(2)>, - const_view_type(3)> - >; + using const_view_type = typename owning_type::const_view_type; + using const_view_type_variant = std::variant(0)>, + const_view_type(1)>, + const_view_type(2)>, + const_view_type(3)>>; using storage_type_variant = concatenated_variant_t; template - using storage_type = std::variant_alternative_t< - std::size_t{is_owning} * std::variant_size_v - + std::size_t{variant_index_from_memory_type(MemType)}, - storage_type_variant - >; + 
using storage_type = + std::variant_alternative_t + + std::size_t{variant_index_from_memory_type(MemType)}, + storage_type_variant>; constexpr mdbuffer() = default; - template , storage_type_variant>>* = nullptr> + template , + storage_type_variant>>* = nullptr> constexpr mdbuffer(mdspan other) : data_{std::move(other)} { } - template ::view_type, storage_type_variant>>* = nullptr> + template , + storage_type_variant>>* = nullptr> + constexpr mdbuffer(mdspan other, + memory_type mem_type) + : data_{[mem_type]() { + auto result = storage_type_variant{}; + if constexpr (AccessorPolicy::is_host_device_accessible()) { + if (mem_type != memory_type::host || mem_type != memory_type::device || + mem_type != memory_type::managed) { + // TODO(wphicks): Build owning variant and copy + } + } else if constexpr (AccessorPolicy::is_host_accessible()) { + if (mem_type != memory_type::host) { + // TODO(wphicks): Build owning variant and copy + } + } else if constexpr (AccessorPolicy::is_device_accessible()) { + if (mem_type != memory_type::device) { + // TODO(wphicks): Build owning variant and copy + } + } + return result; + }()} + { + } + + template ::view_type, + storage_type_variant>>* = nullptr> constexpr mdbuffer(mdarray& other) : mdbuffer{other.view()} { } - template , storage_type_variant>>* = nullptr> + template , + storage_type_variant>>* = nullptr> constexpr mdbuffer(mdarray&& other) : data_{std::move(other)} { } - template , - Extents::rank() == OtherExtents::rank() - >>* = nullptr> + template < + typename OtherElementType = ElementType, + typename OtherExtents = Extents, + typename OtherLayoutPolicy = LayoutPolicy, + typename OtherContainerPolicy = ContainerPolicy, + std::enable_if_t, + Extents::rank() == OtherExtents::rank()>>* = nullptr> constexpr mdbuffer( resources const& res, mdbuffer const& other) @@ -466,53 +385,55 @@ template < { } - [[nodiscard]] auto constexpr mem_type() { + [[nodiscard]] auto constexpr mem_type() + { return static_cast(data_.index() % 
std::variant_size_v); }; - [[nodiscard]] auto constexpr is_owning() { + [[nodiscard]] auto constexpr is_owning() + { return data_.index() >= std::variant_size_v; }; - [[nodiscard]] auto constexpr data_handle() { - return fast_visit([](auto&& inner) { - if constexpr (std::is_convertible_v) { - return pointer{inner.data_handle()}; - } else { - return pointer{inner.data_handle().get()}; - } - }, data_); + [[nodiscard]] auto constexpr data_handle() + { + return fast_visit( + [](auto&& inner) { + if constexpr (std::is_convertible_v) { + return pointer{inner.data_handle()}; + } else { + return pointer{inner.data_handle().get()}; + } + }, + data_); }; - [[nodiscard]] auto constexpr data_handle() const { - return fast_visit([](auto&& inner) { - if constexpr (std::is_convertible_v) { - return const_pointer{inner.data_handle()}; - } else { - return const_pointer{inner.data_handle().get()}; - } - }, data_); + [[nodiscard]] auto constexpr data_handle() const + { + return fast_visit( + [](auto&& inner) { + if constexpr (std::is_convertible_v) { + return const_pointer{inner.data_handle()}; + } else { + return const_pointer{inner.data_handle().get()}; + } + }, + data_); } private: - static auto constexpr get_view_from_data(view_type_variant const& data) { - return data; - } - static auto constexpr get_view_from_data(const_view_type_variant const& data) { - return data; - } - static auto constexpr get_view_from_data(owning_type_variant& data) { + static auto constexpr get_view_from_data(view_type_variant const& data) { return data; } + static auto constexpr get_view_from_data(const_view_type_variant const& data) { return data; } + static auto constexpr get_view_from_data(owning_type_variant& data) + { return view_type_variant{data.view()}; } - static auto constexpr get_view_from_data(owning_type_variant const& data) { + static auto constexpr get_view_from_data(owning_type_variant const& data) + { return const_view_type_variant{data.view()}; } public: - [[nodiscard]] auto view() { 
- return fast_visit( - [](auto&& inner) { - return get_view_from_data(inner); - }, - data_ - ); + [[nodiscard]] auto view() + { + return fast_visit([](auto&& inner) { return get_view_from_data(inner); }, data_); } private: diff --git a/cpp/include/raft/core/mdspan_copy.cuh b/cpp/include/raft/core/mdspan_copy.cuh deleted file mode 100644 index 9a5446a631..0000000000 --- a/cpp/include/raft/core/mdspan_copy.cuh +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once -#include -namespace raft { -template -detail::mdspan_copyable_with_kernel_t -copy(resources const& res, DstType&& dst, SrcType const& src) { - detail::copy(res, dst, src); -} - -#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED -#define RAFT_NON_CUDA_COPY_IMPLEMENTED -template -detail::mdspan_uncopyable_with_kernel_t -copy(resources const& res, DstType&& dst, SrcType const& src) { - detail::copy(res, dst, src); -} -#endif -} // namespace raft diff --git a/cpp/include/raft/core/mdspan_copy.hpp b/cpp/include/raft/core/mdspan_copy.hpp deleted file mode 100644 index 58fca40bd5..0000000000 --- a/cpp/include/raft/core/mdspan_copy.hpp +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once -#include -namespace raft { - -#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED -#define RAFT_NON_CUDA_COPY_IMPLEMENTED -template -detail::mdspan_uncopyable_with_kernel_t -copy(resources const& res, DstType&& dst, SrcType const& src) { - detail::copy(res, dst, src); -} -#endif - -} // namespace raft diff --git a/cpp/test/core/mdspan_copy.cpp b/cpp/test/core/mdspan_copy.cpp index a8e60ee848..bb11b8dadc 100644 --- a/cpp/test/core/mdspan_copy.cpp +++ b/cpp/test/core/mdspan_copy.cpp @@ -17,17 +17,17 @@ #include "../test_utils.h" #include #include +#include +#include #include #include -#include -#include namespace raft { TEST(MDSpanCopy, Mdspan1DHostHost) { - auto res = device_resources{}; - auto cols = std::uint32_t{2}; - auto in_left = make_host_vector(res, cols); + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_host_vector(res, 
cols); auto gen_unique_entry = [](auto&& x) { return x; }; for (auto i = std::uint32_t{}; i < cols; ++i) { @@ -38,17 +38,15 @@ TEST(MDSpanCopy, Mdspan1DHostHost) // std::copy copy(res, out_right.view(), in_left.view()); for (auto i = std::uint32_t{}; i < cols; ++i) { - ASSERT_TRUE(match(out_right(i), - double(gen_unique_entry(i)), - CompareApprox{0.0001})); + ASSERT_TRUE(match(out_right(i), double(gen_unique_entry(i)), CompareApprox{0.0001})); } } TEST(MDSpanCopy, Mdspan1DHostDevice) { - auto res = device_resources{}; - auto cols = std::uint32_t{2}; - auto in_left = make_host_vector(res, cols); + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_host_vector(res, cols); auto gen_unique_entry = [](auto&& x) { return x; }; for (auto i = std::uint32_t{}; i < cols; ++i) { @@ -60,17 +58,16 @@ TEST(MDSpanCopy, Mdspan1DHostDevice) copy(res, out_right.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < cols; ++i) { - ASSERT_TRUE(match(float(out_right(i)), - float(gen_unique_entry(i)), - CompareApprox{0.0001f})); + ASSERT_TRUE( + match(float(out_right(i)), float(gen_unique_entry(i)), CompareApprox{0.0001f})); } } TEST(MDSpanCopy, Mdspan1DDeviceHost) { - auto res = device_resources{}; - auto cols = std::uint32_t{2}; - auto in_left = make_device_vector(res, cols); + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_device_vector(res, cols); auto gen_unique_entry = [](auto&& x) { return x; }; for (auto i = std::uint32_t{}; i < cols; ++i) { @@ -82,9 +79,8 @@ TEST(MDSpanCopy, Mdspan1DDeviceHost) copy(res, out_right.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < cols; ++i) { - ASSERT_TRUE(match(float(out_right(i)), - float(gen_unique_entry(i)), - CompareApprox{0.0001f})); + ASSERT_TRUE( + match(float(out_right(i)), float(gen_unique_entry(i)), CompareApprox{0.0001f})); } } @@ -161,7 +157,7 @@ TEST(MDSpanCopy, Mdspan3DHostHost) TEST(MDSpanCopy, 
Mdspan3DHostDevice) { - auto res = device_resources{}; + auto res = device_resources{}; // Use smaller values here since host/device copy takes awhile. // Non-trivial logic is tested in the other cases. auto constexpr depth = std::uint32_t{5}; @@ -184,8 +180,9 @@ TEST(MDSpanCopy, Mdspan3DHostDevice) auto out_left = make_device_mdarray( res, extents{}); - auto out_right = make_device_mdarray( - res, extents{}); + auto out_right = + make_device_mdarray( + res, extents{}); // raft::copy copy(res, out_right.view(), in_right.view()); @@ -193,8 +190,9 @@ TEST(MDSpanCopy, Mdspan3DHostDevice) for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { for (auto k = std::uint32_t{}; k < cols; ++k) { - ASSERT_TRUE(match( - float(out_right(i, j, k)), float(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + ASSERT_TRUE(match(float(out_right(i, j, k)), + float(gen_unique_entry(i, j, k)), + CompareApprox{0.0001})); } } } @@ -227,8 +225,9 @@ TEST(MDSpanCopy, Mdspan3DHostDevice) for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { for (auto k = std::uint32_t{}; k < cols; ++k) { - ASSERT_TRUE(match( - float(out_left(i, j, k)), float(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + ASSERT_TRUE(match(float(out_left(i, j, k)), + float(gen_unique_entry(i, j, k)), + CompareApprox{0.0001})); } } } @@ -236,10 +235,10 @@ TEST(MDSpanCopy, Mdspan3DHostDevice) TEST(MDSpanCopy, Mdspan2DDeviceDevice) { - auto res = device_resources{}; - auto constexpr rows = std::uint32_t{300}; - auto constexpr cols = std::uint32_t{200}; - auto in_left = make_device_mdarray( + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{300}; + auto constexpr cols = std::uint32_t{200}; + auto in_left = make_device_mdarray( res, extents{}); auto in_right = make_device_mdarray( res, extents{}); @@ -262,8 +261,8 @@ TEST(MDSpanCopy, Mdspan2DDeviceDevice) res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) 
{ for (auto j = std::uint32_t{}; j < cols; ++j) { - ASSERT_TRUE(match( - float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + ASSERT_TRUE( + match(float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } @@ -272,8 +271,8 @@ TEST(MDSpanCopy, Mdspan2DDeviceDevice) res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { - ASSERT_TRUE(match( - float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + ASSERT_TRUE( + match(float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } @@ -282,8 +281,8 @@ TEST(MDSpanCopy, Mdspan2DDeviceDevice) res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { - ASSERT_TRUE(match( - float(out_left(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + ASSERT_TRUE( + match(float(out_left(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } } @@ -309,10 +308,10 @@ TEST(MDSpanCopy, Mdspan2DDeviceDevice) } } - auto out_left = make_device_mdarray( - res, extents{}); - auto out_right = make_device_mdarray( - res, extents{}); + auto out_left = make_device_mdarray( res, extents{}); auto out_right = +make_device_mdarray( res, +extents{}); // Custom kernel copy(res, out_right.view(), in_right.view()); diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/test/core/mdspan_copy.cu index f5c44da97f..78a128ee6e 100644 --- a/cpp/test/core/mdspan_copy.cu +++ b/cpp/test/core/mdspan_copy.cu @@ -17,16 +17,16 @@ #include "../test_utils.h" #include #include +#include +#include #include #include -#include -#include #include namespace raft { TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) { - auto res = device_resources{}; + auto res = device_resources{}; auto constexpr const depth = std::uint32_t{50}; auto constexpr const rows = std::uint32_t{30}; auto constexpr const cols = std::uint32_t{20}; @@ -48,8 +48,9 @@ 
TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) res.sync_stream(); // Test dtype conversion without transpose - auto out_long = make_device_mdarray( - res, extents{}); + auto out_long = + make_device_mdarray( + res, extents{}); copy(res, out_long.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { @@ -89,10 +90,10 @@ TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda) { - auto res = device_resources{}; - auto constexpr rows = std::uint32_t{30}; - auto constexpr cols = std::uint32_t{20}; - auto in_left = make_device_mdarray( + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( res, extents{}); auto in_right = make_device_mdarray( res, extents{}); @@ -118,8 +119,7 @@ TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda) for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( - double(out_right(i, j)), double(gen_unique_entry(i, j)), - CompareApprox{0.0001})); + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } @@ -129,8 +129,7 @@ TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda) for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( - double(out_right(i, j)), double(gen_unique_entry(i, j)), - CompareApprox{0.0001})); + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } copy(res, out_left.view(), in_right.view()); @@ -138,14 +137,13 @@ TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda) for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( - double(out_left(i, j)), double(gen_unique_entry(i, j)), - CompareApprox{0.0001})); + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } } TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) { - auto res = 
device_resources{}; + auto res = device_resources{}; auto constexpr const depth = std::uint32_t{50}; auto constexpr const rows = std::uint32_t{30}; auto constexpr const cols = std::uint32_t{20}; @@ -167,8 +165,9 @@ TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) res.sync_stream(); // Test dtype conversion without transpose - auto out_long = make_host_mdarray( - res, extents{}); + auto out_long = + make_host_mdarray( + res, extents{}); RAFT_LOG_WARN("BEGIN dtype conversion without transpose"); copy(res, out_long.view(), in_left.view()); res.sync_stream(); @@ -210,10 +209,10 @@ TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) TEST(MDSpanCopy, Mdspan2DDeviceHostCuda) { - auto res = device_resources{}; - auto constexpr rows = std::uint32_t{30}; - auto constexpr cols = std::uint32_t{20}; - auto in_left = make_host_mdarray( + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_host_mdarray( res, extents{}); auto in_right = make_host_mdarray( res, extents{}); @@ -239,8 +238,7 @@ TEST(MDSpanCopy, Mdspan2DDeviceHostCuda) for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( - double(out_right(i, j)), double(gen_unique_entry(i, j)), - CompareApprox{0.0001})); + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } @@ -250,8 +248,7 @@ TEST(MDSpanCopy, Mdspan2DDeviceHostCuda) for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( - double(out_right(i, j)), double(gen_unique_entry(i, j)), - CompareApprox{0.0001})); + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } copy(res, out_left.view(), in_right.view()); @@ -259,15 +256,14 @@ TEST(MDSpanCopy, Mdspan2DDeviceHostCuda) for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( - double(out_left(i, j)), 
double(gen_unique_entry(i, j)), - CompareApprox{0.0001})); + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } } TEST(MDSpanCopy, Mdspan3DHostDeviceCuda) { - auto res = device_resources{}; + auto res = device_resources{}; auto constexpr const depth = std::uint32_t{50}; auto constexpr const rows = std::uint32_t{30}; auto constexpr const cols = std::uint32_t{20}; @@ -289,8 +285,9 @@ TEST(MDSpanCopy, Mdspan3DHostDeviceCuda) res.sync_stream(); // Test dtype conversion without transpose - auto out_long = make_device_mdarray( - res, extents{}); + auto out_long = + make_device_mdarray( + res, extents{}); copy(res, out_long.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { @@ -330,10 +327,10 @@ TEST(MDSpanCopy, Mdspan3DHostDeviceCuda) TEST(MDSpanCopy, Mdspan2DHostDeviceCuda) { - auto res = device_resources{}; - auto constexpr rows = std::uint32_t{30}; - auto constexpr cols = std::uint32_t{20}; - auto in_left = make_device_mdarray( + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( res, extents{}); auto in_right = make_device_mdarray( res, extents{}); @@ -359,8 +356,7 @@ TEST(MDSpanCopy, Mdspan2DHostDeviceCuda) for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( - double(out_right(i, j)), double(gen_unique_entry(i, j)), - CompareApprox{0.0001})); + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } @@ -370,8 +366,7 @@ TEST(MDSpanCopy, Mdspan2DHostDeviceCuda) for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( - double(out_right(i, j)), double(gen_unique_entry(i, j)), - CompareApprox{0.0001})); + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } copy(res, out_left.view(), in_right.view()); @@ -379,11 
+374,9 @@ TEST(MDSpanCopy, Mdspan2DHostDeviceCuda) for (auto i = std::uint32_t{}; i < rows; ++i) { for (auto j = std::uint32_t{}; j < cols; ++j) { ASSERT_TRUE(match( - double(out_left(i, j)), double(gen_unique_entry(i, j)), - CompareApprox{0.0001})); + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } } - } // namespace raft From 2a83c1bee9d62c3117bebdd3d0502b96133884df Mon Sep 17 00:00:00 2001 From: William Hicks Date: Mon, 11 Sep 2023 19:04:16 -0400 Subject: [PATCH 55/75] Remove mdbuffer work and document mdspan copy --- cpp/include/raft/core/copy.cuh | 47 +++ cpp/include/raft/core/copy.hpp | 46 +++ cpp/include/raft/core/detail/copy.hpp | 66 +--- cpp/include/raft/core/mdbuffer.hpp | 443 -------------------------- cpp/test/core/mdbuffer.cpp | 66 ---- cpp/test/core/mdbuffer.cu | 23 -- 6 files changed, 95 insertions(+), 596 deletions(-) delete mode 100644 cpp/include/raft/core/mdbuffer.hpp delete mode 100644 cpp/test/core/mdbuffer.cpp delete mode 100644 cpp/test/core/mdbuffer.cu diff --git a/cpp/include/raft/core/copy.cuh b/cpp/include/raft/core/copy.cuh index f3b25f8a45..2e5b0f9a46 100644 --- a/cpp/include/raft/core/copy.cuh +++ b/cpp/include/raft/core/copy.cuh @@ -1,6 +1,53 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + #pragma once #include namespace raft { +/** + * @brief Copy data from one mdspan to another with the same extents + * + * This function copies data from one mdspan to another, regardless of whether + * or not the mdspans have the same layout, memory type (host/device/managed) + * or data type. So long as it is possible to convert the data type from source + * to destination, and the extents are equal, this function should be able to + * perform the copy. Any necessary device operations will be stream-ordered via the CUDA stream + * provided by the `raft::resources` argument. + * + * This header includes a custom kernel used for copying data between + * completely arbitrary mdspans on device. To compile this function in a + * non-CUDA translation unit, `raft/core/copy.hpp` may be used instead. The + * pure C++ header will correctly compile even without a CUDA compiler. + * Depending on the specialization, this CUDA header may invoke the kernel and + * therefore require a CUDA compiler. + * + * + * + * Limitations: Currently this function does not support copying directly + * between two arbitrary mdspans on different CUDA devices. It is assumed that the caller sets the + * correct CUDA device. Furthermore, host-to-host copies that require a transformation of the + * underlying memory layout are currently not performant, although they are supported. + * + * @tparam DstType An mdspan type for the destination container. + * @tparam SrcType An mdspan type for the source container + * @param res raft::resources used to provide a stream for copies involving the + * device. + * @param dst The destination mdspan. + * @param src The source mdspan. 
+ */ template detail::mdspan_copyable_with_kernel_t copy(resources const& res, DstType&& dst, diff --git a/cpp/include/raft/core/copy.hpp b/cpp/include/raft/core/copy.hpp index f8854b3374..4662ed5655 100644 --- a/cpp/include/raft/core/copy.hpp +++ b/cpp/include/raft/core/copy.hpp @@ -1,9 +1,55 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #pragma once #include namespace raft { #ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED #define RAFT_NON_CUDA_COPY_IMPLEMENTED +/** + * @brief Copy data from one mdspan to another with the same extents + * + * This function copies data from one mdspan to another, regardless of whether + * or not the mdspans have the same layout, memory type (host/device/managed) + * or data type. So long as it is possible to convert the data type from source + * to destination, and the extents are equal, this function should be able to + * perform the copy. + * + * This header does _not_ include the custom kernel used for copying data + * between completely arbitrary mdspans on device. For arbitrary copies of this + * kind, `#include <raft/core/copy.cuh>` instead. Specializations of this + * function that require the custom kernel will be SFINAE-omitted when this + * header is used instead of `copy.cuh`. This header _does_ support + * device-to-device copies that can be performed with cuBLAS or a + * straightforward cudaMemcpy.
Any necessary device operations will be stream-ordered via the CUDA + * stream provided by the `raft::resources` argument. + * + * Limitations: Currently this function does not support copying directly + * between two arbitrary mdspans on different CUDA devices. It is assumed that the caller sets the + * correct CUDA device. Furthermore, host-to-host copies that require a transformation of the + * underlying memory layout are currently not performant, although they are supported. + * + * @tparam DstType An mdspan type for the destination container. + * @tparam SrcType An mdspan type for the source container + * @param res raft::resources used to provide a stream for copies involving the + * device. + * @param dst The destination mdspan. + * @param src The source mdspan. + */ template detail::mdspan_uncopyable_with_kernel_t copy(resources const& res, DstType&& dst, diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index 3c820a005e..1c0c258da1 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -281,16 +281,12 @@ __device__ auto increment_indices(IdxType* indices, auto cur_index = IdxType{}; - // printf("pre-increment: %d %d %d: %d\n", old_indices[0], old_indices[1], old_indices[2], - // int(increment)); while (cur_index < md.extent(real_index) - 1 && increment >= index_strides[real_index]) { increment -= index_strides[real_index]; ++cur_index; } indices[real_index] = cur_index; } - // printf("post-increment: %d %d %d: %d\n", old_indices[0], old_indices[1], old_indices[2], - // int(increment)); return increment == IdxType{}; } @@ -332,54 +328,6 @@ __global__ mdspan_copyable_with_kernel_t mdspan_copy_kernel(Ds // The index of the first element in the mdspan which will be copied via // the current tile for this block. 
typename config::index_type tile_offset[config::dst_rank] = {0}; - /* // 0 0 0 - increment_indices( - tile_offset, - src, - tile_offset, - index_strides, - typename config::index_type{0} - ); - // 1 0 0 - increment_indices( - tile_offset, - src, - tile_offset, - index_strides, - typename config::index_type{1} - ); - // 2 0 0 - increment_indices( - tile_offset, - src, - tile_offset, - index_strides, - typename config::index_type{1} - ); - // 3 0 0 - increment_indices( - tile_offset, - src, - tile_offset, - index_strides, - typename config::index_type{1} - ); - // 4 0 0 - increment_indices( - tile_offset, - src, - tile_offset, - index_strides, - typename config::index_type{1} - ); - // 0 1 0 - increment_indices( - tile_offset, - src, - tile_offset, - index_strides, - typename config::index_type{1} - ); */ typename config::index_type cur_indices[config::dst_rank]; auto valid_tile = increment_indices( tile_offset, src, tile_offset, index_strides, blockIdx.x * mdspan_copy_tile_elems); @@ -404,11 +352,7 @@ __global__ mdspan_copyable_with_kernel_t mdspan_copy_kernel(Ds get_mdspan_elem(dst, cur_indices) = tile[tile_read_x][tile_read_y]; } } else { - if (valid_index) { - // printf("read: %d %d %d -> %d %d: %d\n", cur_indices[0], cur_indices[1], cur_indices[2], - // tile_read_x, tile_read_y, int(get_mdspan_elem(src, cur_indices))); - tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); - } + if (valid_index) { tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); } __syncthreads(); valid_index = increment_indices(cur_indices, @@ -416,13 +360,7 @@ __global__ mdspan_copyable_with_kernel_t mdspan_copy_kernel(Ds tile_offset, index_strides, tile_read_y * mdspan_copy_tile_dim + tile_read_x); - if (valid_index) { - // printf("write: %d %d -> %d %d %d: %d\n", tile_read_x, tile_read_y, cur_indices[0], - // cur_indices[1], cur_indices[2], int(tile[tile_read_y][tile_read_x])); - get_mdspan_elem(dst, cur_indices) = tile[tile_read_y][tile_read_x]; - // 
printf("final: %d %d -> %d %d %d: %d\n", tile_read_x, tile_read_y, cur_indices[0], - // cur_indices[1], cur_indices[2], int(get_mdspan_elem(dst, cur_indices))); - } + if (valid_index) { get_mdspan_elem(dst, cur_indices) = tile[tile_read_y][tile_read_x]; } __syncthreads(); } valid_tile = increment_indices( diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp deleted file mode 100644 index 844a8a2c45..0000000000 --- a/cpp/include/raft/core/mdbuffer.hpp +++ /dev/null @@ -1,443 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifndef RAFT_DISABLE_CUDA -#include -#include -#endif - -namespace raft { - -inline auto constexpr variant_index_from_memory_type(raft::memory_type mem_type) -{ - return static_cast>(mem_type); -} - -template -using alternate_from_mem_type = - std::variant_alternative_t; - -template -using default_container_policy_variant = std::variant, - device_uvector_policy, - managed_uvector_policy, - pinned_vector_policy>; - -template > -struct universal_buffer_reference { - using value_type = typename std::remove_cv_t; - using pointer = value_type*; - using const_pointer = value_type const*; - - universal_buffer_reference(pointer ptr, - memory_type mem_type, - stream_view stream = stream_view_per_thread) - : ptr_{ptr}, mem_type_{mem_type}, stream_{stream} - { - } - -#ifndef RAFT_DISABLE_CUDA - explicit universal_buffer_reference(thrust::device_ptr ptr, - memory_type mem_type = memory_type::device, - stream_view stream = stream_view_per_thread) - : universal_buffer_reference{ptr.get(), mem_type, stream} - { - RAFT_EXPECTS(is_device_accessible(mem_type), - "Attempted to create host-only reference from Thrust device pointer"); - } -#endif - - operator value_type() const // NOLINT - { - auto result = value_type{}; - if (is_host_accessible(mem_type_)) { - result = *ptr_; - } else { -#ifdef RAFT_DISABLE_CUDA - throw non_cuda_build_error{"Attempted to access device reference in non-CUDA build"}; -#else - update_host(&result, ptr_, 1, stream_); -#endif - } - return result; - } - - auto operator=(value_type const& other) -> universal_buffer_reference& - { - if (is_host_accessible(mem_type_)) { - *ptr_ = other; - } else { -#ifdef RAFT_DISABLE_CUDA - throw non_cuda_build_error{"Attempted to assign to device reference in non-CUDA build"}; -#else - update_device(ptr_, &other, 1, stream_); -#endif - } - return *this; - } - - private: 
- pointer ptr_; - raft::memory_type mem_type_; - raft::stream_view stream_; -}; - -template > -struct default_buffer_container_policy { - using element_type = ElementType; - using value_type = std::remove_cv_t; - - using reference = universal_buffer_reference; - using const_reference = universal_buffer_reference; - using pointer = element_type*; - using const_pointer = element_type const*; - - using container_policy_variant = ContainerPolicyVariant; - - template - using container_policy = - host_device_accessor, MemType>; - - private: - template - using container_policy_at_index = std::variant_alternative_t; - - public: - using container_type_variant = - std::variant::container_type, - typename container_policy_at_index<1>::container_type, - typename container_policy_at_index<2>::container_type, - typename container_policy_at_index<3>::container_type>; - - template - using container_type = alternate_from_mem_type; - - using accessor_policy_variant = - std::variant::accessor_policy, - typename container_policy_at_index<1>::accessor_policy, - typename container_policy_at_index<2>::accessor_policy, - typename container_policy_at_index<3>::accessor_policy>; - - template - using accessor_policy = alternate_from_mem_type; - - using const_accessor_policy_variant = - std::variant::const_accessor_policy, - typename container_policy_at_index<1>::const_accessor_policy, - typename container_policy_at_index<2>::const_accessor_policy, - typename container_policy_at_index<3>::const_accessor_policy>; - - template - using const_accessor_policy = alternate_from_mem_type; - - template - auto create(raft::resources const& res, size_t n) - { - return container_type(res, n); - } - - auto create(raft::resources const& res, size_t n, raft::memory_type mem_type) - { - auto result = container_type_variant{}; - switch (mem_type) { - case raft::memory_type::host: result = create(res, n); break; - case raft::memory_type::device: result = create(res, n); break; - case 
raft::memory_type::managed: result = create(res, n); break; - case raft::memory_type::pinned: result = create(res, n); break; - } - return result; - } - - private: - template - auto static constexpr has_stream() -> decltype(std::declval().stream(), bool()) - { - return true; - }; - auto static constexpr has_stream(...) -> bool { return false; }; - - public: - template >()>* = nullptr> - [[nodiscard]] auto constexpr access(container_type& c, std::size_t n) const noexcept - { - return reference{c.data() + n, MemType, c.stream()}; - } - - template >()>* = nullptr> - [[nodiscard]] auto constexpr access(container_type& c, std::size_t n) const noexcept - { - return reference{c.data() + n, MemType}; - } - - template >()>* = nullptr> - [[nodiscard]] auto constexpr access(container_type const& c, - std::size_t n) const noexcept - { - return const_reference{c.data() + n, MemType, c.stream()}; - } - - template >()>* = nullptr> - [[nodiscard]] auto constexpr access(container_type const& c, - std::size_t n) const noexcept - { - return const_reference{c.data() + n, MemType}; - } - - template - [[nodiscard]] auto make_accessor_policy() noexcept - { - return accessor_policy{}; - } - template - [[nodiscard]] auto make_accessor_policy() const noexcept - { - return const_accessor_policy{}; - } - - [[nodiscard]] auto make_accessor_policy(memory_type mem_type) noexcept - { - auto result = accessor_policy_variant{}; - switch (mem_type) { - case memory_type::host: result = make_accessor_policy(); break; - case memory_type::device: result = make_accessor_policy(); break; - case memory_type::managed: result = make_accessor_policy(); break; - case memory_type::pinned: result = make_accessor_policy(); break; - } - return result; - } - [[nodiscard]] auto make_accessor_policy(memory_type mem_type) const noexcept - { - auto result = const_accessor_policy_variant{}; - switch (mem_type) { - case memory_type::host: result = make_accessor_policy(); break; - case memory_type::device: result = 
make_accessor_policy(); break; - case memory_type::managed: result = make_accessor_policy(); break; - case memory_type::pinned: result = make_accessor_policy(); break; - } - return result; - } -}; - -template > -struct mdbuffer { - using extents_type = Extents; - using layout_type = LayoutPolicy; - using mapping_type = typename layout_type::template mapping; - using element_type = ElementType; - - using value_type = std::remove_cv_t; - using index_type = typename extents_type::index_type; - using difference_type = std::ptrdiff_t; - using rank_type = typename extents_type::rank_type; - - using container_policy_type = ContainerPolicy; - using accessor_policy_variant = typename ContainerPolicy::accessor_policy_variant; - - template - using accessor_policy = alternate_from_mem_type; - - using container_type_variant = typename container_policy_type::container_type_variant; - - template - using container_type = typename container_policy_type::template container_type; - - using pointer = typename container_policy_type::pointer; - using const_pointer = typename container_policy_type::const_pointer; - using reference = typename container_policy_type::reference; - using const_reference = typename container_policy_type::const_reference; - - template - using owning_type = mdarray>; - using owning_type_variant = std::variant(0)>, - owning_type(1)>, - owning_type(2)>, - owning_type(3)>>; - - template - using view_type = typename owning_type::view_type; - - using view_type_variant = std::variant(0)>, - view_type(1)>, - view_type(2)>, - view_type(3)>>; - - template - using const_view_type = typename owning_type::const_view_type; - using const_view_type_variant = std::variant(0)>, - const_view_type(1)>, - const_view_type(2)>, - const_view_type(3)>>; - - using storage_type_variant = concatenated_variant_t; - - template - using storage_type = - std::variant_alternative_t + - std::size_t{variant_index_from_memory_type(MemType)}, - storage_type_variant>; - - constexpr mdbuffer() = 
default; - - template , - storage_type_variant>>* = nullptr> - constexpr mdbuffer(mdspan other) - : data_{std::move(other)} - { - } - - template , - storage_type_variant>>* = nullptr> - constexpr mdbuffer(mdspan other, - memory_type mem_type) - : data_{[mem_type]() { - auto result = storage_type_variant{}; - if constexpr (AccessorPolicy::is_host_device_accessible()) { - if (mem_type != memory_type::host || mem_type != memory_type::device || - mem_type != memory_type::managed) { - // TODO(wphicks): Build owning variant and copy - } - } else if constexpr (AccessorPolicy::is_host_accessible()) { - if (mem_type != memory_type::host) { - // TODO(wphicks): Build owning variant and copy - } - } else if constexpr (AccessorPolicy::is_device_accessible()) { - if (mem_type != memory_type::device) { - // TODO(wphicks): Build owning variant and copy - } - } - return result; - }()} - { - } - - template ::view_type, - storage_type_variant>>* = nullptr> - constexpr mdbuffer(mdarray& other) - : mdbuffer{other.view()} - { - } - - template , - storage_type_variant>>* = nullptr> - constexpr mdbuffer(mdarray&& other) - : data_{std::move(other)} - { - } - - template < - typename OtherElementType = ElementType, - typename OtherExtents = Extents, - typename OtherLayoutPolicy = LayoutPolicy, - typename OtherContainerPolicy = ContainerPolicy, - std::enable_if_t, - Extents::rank() == OtherExtents::rank()>>* = nullptr> - constexpr mdbuffer( - resources const& res, - mdbuffer const& other) - : data_{other.data_} - { - } - - [[nodiscard]] auto constexpr mem_type() - { - return static_cast(data_.index() % std::variant_size_v); - }; - [[nodiscard]] auto constexpr is_owning() - { - return data_.index() >= std::variant_size_v; - }; - [[nodiscard]] auto constexpr data_handle() - { - return fast_visit( - [](auto&& inner) { - if constexpr (std::is_convertible_v) { - return pointer{inner.data_handle()}; - } else { - return pointer{inner.data_handle().get()}; - } - }, - data_); - }; - [[nodiscard]] auto 
constexpr data_handle() const - { - return fast_visit( - [](auto&& inner) { - if constexpr (std::is_convertible_v) { - return const_pointer{inner.data_handle()}; - } else { - return const_pointer{inner.data_handle().get()}; - } - }, - data_); - } - - private: - static auto constexpr get_view_from_data(view_type_variant const& data) { return data; } - static auto constexpr get_view_from_data(const_view_type_variant const& data) { return data; } - static auto constexpr get_view_from_data(owning_type_variant& data) - { - return view_type_variant{data.view()}; - } - static auto constexpr get_view_from_data(owning_type_variant const& data) - { - return const_view_type_variant{data.view()}; - } - - public: - [[nodiscard]] auto view() - { - return fast_visit([](auto&& inner) { return get_view_from_data(inner); }, data_); - } - - private: - storage_type_variant data_{}; -}; - -} // namespace raft diff --git a/cpp/test/core/mdbuffer.cpp b/cpp/test/core/mdbuffer.cpp deleted file mode 100644 index 72b7264bd7..0000000000 --- a/cpp/test/core/mdbuffer.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#ifndef RAFT_DISABLE_CUDA -#include -#endif -namespace raft { -TEST(MDBuffer, DefaultConstructor) { - auto buf = mdbuffer>{}; -} - -TEST(MDBuffer, FromHost) { - auto res = raft::resources{}; - auto rows = 3; - auto features = 5; - auto matrix = make_host_matrix(res, rows, features); - auto buf = mdbuffer{matrix}; - ASSERT_EQ(buf.mem_type(), memory_type::host); - ASSERT_FALSE(buf.is_owning()); - ASSERT_EQ(buf.data_handle(), matrix.data_handle()); - - auto* ptr = matrix.data_handle(); - buf = mdbuffer{std::move(matrix)}; - ASSERT_EQ(buf.mem_type(), memory_type::host); - ASSERT_TRUE(buf.is_owning()); - ASSERT_EQ(buf.data_handle(), ptr); -} - -TEST(MDBuffer, FromDevice) { - auto res = raft::resources{}; - auto rows = 3; - auto features = 5; - auto matrix = make_device_matrix(res, rows, features); - auto buf = mdbuffer{matrix}; - ASSERT_EQ(buf.mem_type(), memory_type::device); - ASSERT_FALSE(buf.is_owning()); - ASSERT_EQ(buf.data_handle(), matrix.data_handle()); - - auto* ptr = matrix.data_handle(); - buf = mdbuffer{std::move(matrix)}; - ASSERT_EQ(buf.mem_type(), memory_type::device); - ASSERT_TRUE(buf.is_owning()); - ASSERT_EQ(buf.data_handle(), ptr); -} -} // namespace raft - diff --git a/cpp/test/core/mdbuffer.cu b/cpp/test/core/mdbuffer.cu deleted file mode 100644 index 4843f0616d..0000000000 --- a/cpp/test/core/mdbuffer.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -namespace raft { -} // namespace raft From 624e4f3502cd37094ba3fba734483f4d92765e28 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 12 Sep 2023 10:49:30 -0400 Subject: [PATCH 56/75] Remove un-needed changes left over from mdbuffer --- build.sh | 7 - ci/build_cpp.sh | 2 +- cpp/CMakeLists.txt | 63 +- cpp/cmake/thirdparty/get_fmt.cmake | 22 - cpp/cmake/thirdparty/get_spdlog.cmake | 33 - .../core/detail/fail_container_policy.hpp | 159 ----- cpp/include/raft/util/variant_utils.hpp | 55 -- cpp/internal/CMakeLists.txt | 6 +- cpp/test/CMakeLists.txt | 670 +++++++++--------- 9 files changed, 352 insertions(+), 665 deletions(-) delete mode 100644 cpp/cmake/thirdparty/get_fmt.cmake delete mode 100644 cpp/cmake/thirdparty/get_spdlog.cmake delete mode 100644 cpp/include/raft/core/detail/fail_container_policy.hpp delete mode 100644 cpp/include/raft/util/variant_utils.hpp diff --git a/build.sh b/build.sh index 4ed1096b98..071820ba93 100755 --- a/build.sh +++ b/build.sh @@ -45,7 +45,6 @@ HELP="$0 [ ...] [ ...] 
[--cmake-args=\"\"] [--cache-tool=) -set(RAFT_CTK_MATH_DEPENDENCIES "") -if(NOT DISABLE_CUDA) - target_compile_options( - raft INTERFACE $<$:--expt-extended-lambda - --expt-relaxed-constexpr> - ) +target_compile_options( + raft INTERFACE $<$:--expt-extended-lambda + --expt-relaxed-constexpr> +) - set(RAFT_CUSOLVER_DEPENDENCY CUDA::cusolver${_ctk_static_suffix}) - set(RAFT_CUBLAS_DEPENDENCY CUDA::cublas${_ctk_static_suffix}) - set(RAFT_CURAND_DEPENDENCY CUDA::curand${_ctk_static_suffix}) - set(RAFT_CUSPARSE_DEPENDENCY CUDA::cusparse${_ctk_static_suffix}) +set(RAFT_CUSOLVER_DEPENDENCY CUDA::cusolver${_ctk_static_suffix}) +set(RAFT_CUBLAS_DEPENDENCY CUDA::cublas${_ctk_static_suffix}) +set(RAFT_CURAND_DEPENDENCY CUDA::curand${_ctk_static_suffix}) +set(RAFT_CUSPARSE_DEPENDENCY CUDA::cusparse${_ctk_static_suffix}) - set(RAFT_CTK_MATH_DEPENDENCIES ${RAFT_CUBLAS_DEPENDENCY} ${RAFT_CUSOLVER_DEPENDENCY} - ${RAFT_CUSPARSE_DEPENDENCY} ${RAFT_CURAND_DEPENDENCY} - ) -endif() +set(RAFT_CTK_MATH_DEPENDENCIES ${RAFT_CUBLAS_DEPENDENCY} ${RAFT_CUSOLVER_DEPENDENCY} + ${RAFT_CUSPARSE_DEPENDENCY} ${RAFT_CURAND_DEPENDENCY} +) # Endian detection include(TestBigEndian) @@ -255,7 +249,7 @@ endif() # ################################################################################################## # * NVTX support in raft ----------------------------------------------------- -if(RAFT_NVTX AND (NOT DISABLE_CUDA)) +if(RAFT_NVTX) # This enables NVTX within the project with no option to disable it downstream. 
target_link_libraries(raft INTERFACE CUDA::nvToolsExt) target_compile_definitions(raft INTERFACE NVTX_ENABLED) @@ -531,35 +525,26 @@ target_link_libraries( # * raft_distributed ------------------------------------------------------------------------------- add_library(raft_distributed INTERFACE) -# No distributed support for CUDA-free builds yet -if(TARGET raft_distributed - AND (NOT TARGET raft::distributed) - AND (NOT DISABLE_CUDA) -) +if(TARGET raft_distributed AND (NOT TARGET raft::distributed)) add_library(raft::distributed ALIAS raft_distributed) endif() set_target_properties(raft_distributed PROPERTIES EXPORT_NAME distributed) -if(NOT RAFT_DISABLE_CUDA) - rapids_find_generate_module( - NCCL - HEADER_NAMES nccl.h - LIBRARY_NAMES nccl - BUILD_EXPORT_SET raft-distributed-exports - INSTALL_EXPORT_SET raft-distributed-exports - ) -endif() +rapids_find_generate_module( + NCCL + HEADER_NAMES nccl.h + LIBRARY_NAMES nccl + BUILD_EXPORT_SET raft-distributed-exports + INSTALL_EXPORT_SET raft-distributed-exports +) rapids_export_package(BUILD ucx raft-distributed-exports) rapids_export_package(INSTALL ucx raft-distributed-exports) -if(NOT RAFT_DISABLE_CUDA) - rapids_export_package(BUILD NCCL raft-distributed-exports) - rapids_export_package(INSTALL NCCL raft-distributed-exports) - target_link_libraries(raft_distributed INTERFACE ucx::ucp NCCL::NCCL) -else() - target_link_libraries(raft_distributed INTERFACE ucx::ucp) -endif() +rapids_export_package(BUILD NCCL raft-distributed-exports) +rapids_export_package(INSTALL NCCL raft-distributed-exports) + +target_link_libraries(raft_distributed INTERFACE ucx::ucp NCCL::NCCL) # ################################################################################################## # * install targets----------------------------------------------------------- diff --git a/cpp/cmake/thirdparty/get_fmt.cmake b/cpp/cmake/thirdparty/get_fmt.cmake deleted file mode 100644 index 5787fb73fb..0000000000 --- a/cpp/cmake/thirdparty/get_fmt.cmake 
+++ /dev/null @@ -1,22 +0,0 @@ -# ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -# Use CPM to find or clone fmt -function(find_and_configure_fmt) - - include(${rapids-cmake-dir}/cpm/fmt.cmake) - rapids_cpm_fmt(INSTALL_EXPORT_SET rmm-exports BUILD_EXPORT_SET rmm-exports) -endfunction() - -find_and_configure_fmt() diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake deleted file mode 100644 index 24bbea89d5..0000000000 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ /dev/null @@ -1,33 +0,0 @@ -# ============================================================================= -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -# Use CPM to find or clone speedlog -function(find_and_configure_spdlog) - - include(${rapids-cmake-dir}/cpm/spdlog.cmake) - rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET rmm-exports) - rapids_export_package(BUILD spdlog rmm-exports) - - if(spdlog_ADDED) - rapids_export( - BUILD spdlog - EXPORT_SET spdlog - GLOBAL_TARGETS spdlog spdlog_header_only - NAMESPACE spdlog::) - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] rmm-exports) - endif() -endfunction() - -find_and_configure_spdlog() diff --git a/cpp/include/raft/core/detail/fail_container_policy.hpp b/cpp/include/raft/core/detail/fail_container_policy.hpp deleted file mode 100644 index e468539a0d..0000000000 --- a/cpp/include/raft/core/detail/fail_container_policy.hpp +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once -#include -#include -#include -#include -#include - -namespace raft { -namespace detail { - -template -struct fail_reference { - using value_type = typename std::remove_cv_t; - using pointer = T*; - using const_pointer = T const*; - - fail_reference() = default; - template - fail_reference(T* ptr, StreamViewType stream) { - throw non_cuda_build_error{ - "Attempted to construct reference to device data in non-CUDA build" - }; - } - - operator value_type() const // NOLINT - { - throw non_cuda_build_error{ - "Attempted to dereference device data in non-CUDA build" - }; - return value_type{}; - } - auto operator=(T const& other) -> fail_reference& - { - throw non_cuda_build_error{ - "Attempted to assign to device data in non-CUDA build" - }; - return *this; - } -}; - -/** A placeholder container which throws an exception on use - * - * This placeholder is used in non-CUDA builds for container types that would - * otherwise be provided with CUDA code. Attempting to construct a non-empty - * container of this type throws an exception indicating that there was an - * attempt to use the device from a non-CUDA build. An example of when this - * might happen is if a downstream application attempts to allocate a device - * mdarray using a library built with non-CUDA RAFT. 
- */ -template -struct fail_container { - using value_type = T; - using size_type = std::size_t; - - using reference = fail_reference; - using const_reference = fail_reference; - - using pointer = value_type*; - using const_pointer = value_type const*; - - using iterator = pointer; - using const_iterator = const_pointer; - - explicit fail_container(size_t n=size_t{}) { - if (n != size_t{}) { - throw non_cuda_build_error{ - "Attempted to allocate device container in non-CUDA build" - }; - } - } - - template - auto operator[](Index i) noexcept -> reference { - RAFT_LOG_ERROR( - "Attempted to access device data in non-CUDA build" - ); - return reference{}; - } - - template - auto operator[](Index i) const noexcept -> const_reference { - RAFT_LOG_ERROR( - "Attempted to access device data in non-CUDA build" - ); - return const_reference{}; - } - void resize(size_t n) { - if (n != size_t{}) { - throw non_cuda_build_error{ - "Attempted to allocate device container in non-CUDA build" - }; - } - } - - [[nodiscard]] auto data() noexcept -> pointer { return nullptr; } - [[nodiscard]] auto data() const noexcept -> const_pointer { return nullptr; } -}; - - -/** A placeholder container policy which throws an exception on use - * - * This placeholder is used in non-CUDA builds for container types that would - * otherwise be provided with CUDA code. Attempting to construct a non-empty - * container of this type throws an exception indicating that there was an - * attempt to use the device from a non-CUDA build. An example of when this - * might happen is if a downstream application attempts to allocate a device - * mdarray using a library built with non-CUDA RAFT. 
- */ -template -struct fail_container_policy { - using element_type = ElementType; - using container_type = fail_container; - using pointer = typename container_type::pointer; - using const_pointer = typename container_type::const_pointer; - using reference = typename container_type::reference; - using const_reference = typename container_type::const_reference; - - using accessor_policy = std::experimental::default_accessor; - using const_accessor_policy = std::experimental::default_accessor; - - auto create(raft::resources const& res, size_t n) -> container_type - { - return container_type(n); - } - - fail_container_policy() = default; - - [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference - { - return c[n]; - } - [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept - -> const_reference - { - return c[n]; - } - - [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } - [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } -}; - -} // namespace detail -} // namespace raft diff --git a/cpp/include/raft/util/variant_utils.hpp b/cpp/include/raft/util/variant_utils.hpp deleted file mode 100644 index d8c7a45efe..0000000000 --- a/cpp/include/raft/util/variant_utils.hpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { - -template -struct concatenated_variant; - -template -struct concatenated_variant , std::variant>{ - using type = std::variant; -}; - -template -using concatenated_variant_t = typename concatenated_variant::type; - -template -auto fast_visit (visitor_t&& visitor, variant_t&& variant) { - using return_t = decltype( - std::forward(visitor)(std::get<0>(variant)) - ); - auto result = return_t{}; - - if constexpr (index == std::variant_size_v>>) { - __builtin_unreachable(); - } else { - if (index == variant.index()) { - result = std::forward(visitor)(std::get(std::forward(variant))); - } else { - result = fast_visit( - std::forward(visitor), - std::forward(variant) - ); - } - } - return result; -} - -} // namespace raft diff --git a/cpp/internal/CMakeLists.txt b/cpp/internal/CMakeLists.txt index cae278aa9e..5d9e8c6f8b 100644 --- a/cpp/internal/CMakeLists.txt +++ b/cpp/internal/CMakeLists.txt @@ -17,9 +17,5 @@ if(BUILD_TESTS OR BUILD_PRIMS_BENCH) target_include_directories( raft_internal INTERFACE "$" ) - if(DISABLE_CUDA) - target_compile_features(raft_internal INTERFACE cxx_std_17) - else() - target_compile_features(raft_internal INTERFACE cxx_std_17 $) - endif() + target_compile_features(raft_internal INTERFACE cxx_std_17 $) endif() diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 6279501b67..22e0a2ceb7 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -52,33 +52,19 @@ function(ConfigureTest) $ $ ) - - if(DISABLE_CUDA) - set_target_properties( - ${TEST_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" - INSTALL_RPATH "\$ORIGIN/../../../lib" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - ) - target_compile_options(${TEST_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>") - target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_DISABLE_CUDA") - else() - set_target_properties( - ${TEST_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" - INSTALL_RPATH "\$ORIGIN/../../../lib" - CXX_STANDARD 17 - 
CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - ) - target_compile_options( - ${TEST_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>" - "$<$:${RAFT_CUDA_FLAGS}>" - ) - endif() - + set_target_properties( + ${TEST_NAME} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) + target_compile_options( + ${TEST_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) if(_RAFT_TEST_EXPLICIT_INSTANTIATE_ONLY) target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY") endif() @@ -102,185 +88,181 @@ endfunction() # * distance tests ------------------------------------------------------------------------- if(BUILD_TESTS) - if(NOT DISABLE_CUDA) - ConfigureTest( - NAME - CLUSTER_TEST - PATH - test/cluster/kmeans.cu - test/cluster/kmeans_balanced.cu - test/cluster/cluster_solvers.cu - test/cluster/linkage.cu - test/cluster/kmeans_find_k.cu - LIB - EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME - CORE_TEST - PATH - test/core/device_resources_manager.cpp - test/core/device_setter.cpp - test/core/logger.cpp - test/core/math_device.cu - test/core/math_host.cpp - test/core/operators_device.cu - test/core/operators_host.cpp - test/core/handle.cpp - test/core/interruptible.cu - test/core/nvtx.cpp - test/core/mdarray.cu - test/core/mdspan_copy.cpp - test/core/mdspan_copy.cu - test/core/mdspan_utils.cu - test/core/numpy_serializer.cu - test/core/memory_type.cpp - test/core/sparse_matrix.cu - test/core/sparse_matrix.cpp - test/core/span.cpp - test/core/span.cu - test/core/temporary_device_buffer.cu - test/test.cpp - LIB - EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME - DISTANCE_TEST - PATH - test/distance/dist_adj.cu - test/distance/dist_adj_distance_instance.cu - test/distance/dist_canberra.cu - test/distance/dist_correlation.cu - test/distance/dist_cos.cu - test/distance/dist_hamming.cu - 
test/distance/dist_hellinger.cu - test/distance/dist_inner_product.cu - test/distance/dist_jensen_shannon.cu - test/distance/dist_kl_divergence.cu - test/distance/dist_l1.cu - test/distance/dist_l2_exp.cu - test/distance/dist_l2_unexp.cu - test/distance/dist_l2_sqrt_exp.cu - test/distance/dist_l_inf.cu - test/distance/dist_lp_unexp.cu - test/distance/dist_russell_rao.cu - test/distance/masked_nn.cu - test/distance/masked_nn_compress_to_bits.cu - test/distance/fused_l2_nn.cu - test/distance/gram.cu - LIB - EXPLICIT_INSTANTIATE_ONLY - ) - - list( - APPEND - EXT_HEADER_TEST_SOURCES - test/ext_headers/raft_neighbors_brute_force.cu - test/ext_headers/raft_distance_distance.cu - test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu - test/ext_headers/raft_matrix_detail_select_k.cu - test/ext_headers/raft_neighbors_ball_cover.cu - test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu - test/ext_headers/raft_distance_fused_l2_nn.cu - test/ext_headers/raft_neighbors_ivf_pq.cu - test/ext_headers/raft_util_memory_pool.cpp - test/ext_headers/raft_neighbors_ivf_flat.cu - test/ext_headers/raft_core_logger.cpp - test/ext_headers/raft_neighbors_refine.cu - test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu - test/ext_headers/raft_neighbors_detail_selection_faiss.cu - test/ext_headers/raft_linalg_detail_coalesced_reduction.cu - test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu - test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu - test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu - ) - - # Test that the split headers compile in isolation with: - # - # * EXT_HEADERS_TEST_COMPILED_EXPLICIT: RAFT_COMPILED, RAFT_EXPLICIT_INSTANTIATE_ONLY defined - # * EXT_HEADERS_TEST_COMPILED_IMPLICIT: RAFT_COMPILED defined - # * EXT_HEADERS_TEST_IMPLICIT: no macros defined. 
- ConfigureTest( - NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB - EXPLICIT_INSTANTIATE_ONLY - ) - ConfigureTest(NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB) - ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES}) - - ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu) - - ConfigureTest( - NAME - LINALG_TEST - PATH - test/linalg/add.cu - test/linalg/axpy.cu - test/linalg/binary_op.cu - test/linalg/cholesky_r1.cu - test/linalg/coalesced_reduction.cu - test/linalg/divide.cu - test/linalg/dot.cu - test/linalg/eig.cu - test/linalg/eig_sel.cu - test/linalg/gemm_layout.cu - test/linalg/gemv.cu - test/linalg/map.cu - test/linalg/map_then_reduce.cu - test/linalg/matrix_vector.cu - test/linalg/matrix_vector_op.cu - test/linalg/mean_squared_error.cu - test/linalg/multiply.cu - test/linalg/norm.cu - test/linalg/normalize.cu - test/linalg/power.cu - test/linalg/randomized_svd.cu - test/linalg/reduce.cu - test/linalg/reduce_cols_by_key.cu - test/linalg/reduce_rows_by_key.cu - test/linalg/rsvd.cu - test/linalg/sqrt.cu - test/linalg/strided_reduction.cu - test/linalg/subtract.cu - test/linalg/svd.cu - test/linalg/ternary_op.cu - test/linalg/transpose.cu - test/linalg/unary_op.cu - ) - - ConfigureTest( - NAME - MATRIX_TEST - PATH - test/matrix/argmax.cu - test/matrix/argmin.cu - test/matrix/columnSort.cu - test/matrix/diagonal.cu - test/matrix/gather.cu - test/matrix/scatter.cu - test/matrix/eye.cu - test/matrix/linewise_op.cu - test/matrix/math.cu - test/matrix/matrix.cu - test/matrix/norm.cu - test/matrix/reverse.cu - test/matrix/slice.cu - test/matrix/triangular.cu - test/sparse/spectral_matrix.cu - LIB - EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME MATRIX_SELECT_TEST PATH test/matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME MATRIX_SELECT_LARGE_TEST PATH test/matrix/select_large_k.cu LIB - EXPLICIT_INSTANTIATE_ONLY - ) + 
ConfigureTest( + NAME + CLUSTER_TEST + PATH + test/cluster/kmeans.cu + test/cluster/kmeans_balanced.cu + test/cluster/cluster_solvers.cu + test/cluster/linkage.cu + test/cluster/kmeans_find_k.cu + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest( + NAME + CORE_TEST + PATH + test/core/device_resources_manager.cpp + test/core/device_setter.cpp + test/core/logger.cpp + test/core/math_device.cu + test/core/math_host.cpp + test/core/operators_device.cu + test/core/operators_host.cpp + test/core/handle.cpp + test/core/interruptible.cu + test/core/nvtx.cpp + test/core/mdarray.cu + test/core/mdspan_copy.cpp + test/core/mdspan_copy.cu + test/core/mdspan_utils.cu + test/core/numpy_serializer.cu + test/core/memory_type.cpp + test/core/sparse_matrix.cu + test/core/sparse_matrix.cpp + test/core/span.cpp + test/core/span.cu + test/core/temporary_device_buffer.cu + test/test.cpp + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest( + NAME + DISTANCE_TEST + PATH + test/distance/dist_adj.cu + test/distance/dist_adj_distance_instance.cu + test/distance/dist_canberra.cu + test/distance/dist_correlation.cu + test/distance/dist_cos.cu + test/distance/dist_hamming.cu + test/distance/dist_hellinger.cu + test/distance/dist_inner_product.cu + test/distance/dist_jensen_shannon.cu + test/distance/dist_kl_divergence.cu + test/distance/dist_l1.cu + test/distance/dist_l2_exp.cu + test/distance/dist_l2_unexp.cu + test/distance/dist_l2_sqrt_exp.cu + test/distance/dist_l_inf.cu + test/distance/dist_lp_unexp.cu + test/distance/dist_russell_rao.cu + test/distance/masked_nn.cu + test/distance/masked_nn_compress_to_bits.cu + test/distance/fused_l2_nn.cu + test/distance/gram.cu + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + list( + APPEND + EXT_HEADER_TEST_SOURCES + test/ext_headers/raft_neighbors_brute_force.cu + test/ext_headers/raft_distance_distance.cu + test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu + test/ext_headers/raft_matrix_detail_select_k.cu + 
test/ext_headers/raft_neighbors_ball_cover.cu + test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu + test/ext_headers/raft_distance_fused_l2_nn.cu + test/ext_headers/raft_neighbors_ivf_pq.cu + test/ext_headers/raft_util_memory_pool.cpp + test/ext_headers/raft_neighbors_ivf_flat.cu + test/ext_headers/raft_core_logger.cpp + test/ext_headers/raft_neighbors_refine.cu + test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu + test/ext_headers/raft_neighbors_detail_selection_faiss.cu + test/ext_headers/raft_linalg_detail_coalesced_reduction.cu + test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu + test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu + test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu + ) + + # Test that the split headers compile in isolation with: + # + # * EXT_HEADERS_TEST_COMPILED_EXPLICIT: RAFT_COMPILED, RAFT_EXPLICIT_INSTANTIATE_ONLY defined + # * EXT_HEADERS_TEST_COMPILED_IMPLICIT: RAFT_COMPILED defined + # * EXT_HEADERS_TEST_IMPLICIT: no macros defined. 
+ ConfigureTest( + NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB + EXPLICIT_INSTANTIATE_ONLY + ) + ConfigureTest(NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB) + ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES}) + + ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu) + + ConfigureTest( + NAME + LINALG_TEST + PATH + test/linalg/add.cu + test/linalg/axpy.cu + test/linalg/binary_op.cu + test/linalg/cholesky_r1.cu + test/linalg/coalesced_reduction.cu + test/linalg/divide.cu + test/linalg/dot.cu + test/linalg/eig.cu + test/linalg/eig_sel.cu + test/linalg/gemm_layout.cu + test/linalg/gemv.cu + test/linalg/map.cu + test/linalg/map_then_reduce.cu + test/linalg/matrix_vector.cu + test/linalg/matrix_vector_op.cu + test/linalg/mean_squared_error.cu + test/linalg/multiply.cu + test/linalg/norm.cu + test/linalg/normalize.cu + test/linalg/power.cu + test/linalg/randomized_svd.cu + test/linalg/reduce.cu + test/linalg/reduce_cols_by_key.cu + test/linalg/reduce_rows_by_key.cu + test/linalg/rsvd.cu + test/linalg/sqrt.cu + test/linalg/strided_reduction.cu + test/linalg/subtract.cu + test/linalg/svd.cu + test/linalg/ternary_op.cu + test/linalg/transpose.cu + test/linalg/unary_op.cu + ) + + ConfigureTest( + NAME + MATRIX_TEST + PATH + test/matrix/argmax.cu + test/matrix/argmin.cu + test/matrix/columnSort.cu + test/matrix/diagonal.cu + test/matrix/gather.cu + test/matrix/scatter.cu + test/matrix/eye.cu + test/matrix/linewise_op.cu + test/matrix/math.cu + test/matrix/matrix.cu + test/matrix/norm.cu + test/matrix/reverse.cu + test/matrix/slice.cu + test/matrix/triangular.cu + test/sparse/spectral_matrix.cu + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest(NAME MATRIX_SELECT_TEST PATH test/matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY) + + ConfigureTest( + NAME MATRIX_SELECT_LARGE_TEST PATH test/matrix/select_large_k.cu LIB EXPLICIT_INSTANTIATE_ONLY + ) 
ConfigureTest( NAME @@ -298,144 +280,144 @@ if(BUILD_TESTS) test/random/sample_without_replacement.cu ) - ConfigureTest( - NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu - test/linalg/eigen_solvers.cu test/lap/lap.cu test/sparse/mst.cu LIB EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME - SPARSE_TEST - PATH - test/sparse/add.cu - test/sparse/convert_coo.cu - test/sparse/convert_csr.cu - test/sparse/csr_row_slice.cu - test/sparse/csr_to_dense.cu - test/sparse/csr_transpose.cu - test/sparse/degree.cu - test/sparse/filter.cu - test/sparse/norm.cu - test/sparse/normalize.cu - test/sparse/reduce.cu - test/sparse/row_op.cu - test/sparse/sort.cu - test/sparse/spgemmi.cu - test/sparse/symmetrize.cu - ) - - ConfigureTest( - NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu - test/sparse/gram.cu LIB EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME - SPARSE_NEIGHBORS_TEST - PATH - test/sparse/neighbors/cross_component_nn.cu - test/sparse/neighbors/brute_force.cu - test/sparse/neighbors/knn_graph.cu - LIB - EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME - NEIGHBORS_TEST - PATH - test/neighbors/knn.cu - test/neighbors/fused_l2_knn.cu - test/neighbors/tiled_knn.cu - test/neighbors/haversine.cu - test/neighbors/ball_cover.cu - test/neighbors/epsilon_neighborhood.cu - test/neighbors/refine.cu - LIB - EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureTest( - NAME - NEIGHBORS_ANN_CAGRA_TEST - PATH - test/neighbors/ann_cagra/test_float_uint32_t.cu - test/neighbors/ann_cagra/test_int8_t_uint32_t.cu - test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu - test/neighbors/ann_cagra/test_float_int64_t.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu - 
src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu - src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu - src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu - src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu - LIB - EXPLICIT_INSTANTIATE_ONLY - GPUS - 1 - PERCENT - 100 - ) - - ConfigureTest( - NAME - NEIGHBORS_ANN_IVF_TEST - PATH - test/neighbors/ann_ivf_flat/test_float_int64_t.cu - test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu - test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu - test/neighbors/ann_ivf_pq/test_float_int64_t.cu - test/neighbors/ann_ivf_pq/test_float_uint32_t.cu - test/neighbors/ann_ivf_pq/test_float_int64_t.cu - test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu - test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu - LIB - EXPLICIT_INSTANTIATE_ONLY - GPUS - 1 - PERCENT - 100 - ) - - ConfigureTest( - NAME NEIGHBORS_SELECTION_TEST PATH test/neighbors/selection.cu LIB EXPLICIT_INSTANTIATE_ONLY - GPUS 1 PERCENT 50 - ) - - ConfigureTest( - NAME - STATS_TEST - PATH - test/stats/accuracy.cu - test/stats/adjusted_rand_index.cu - test/stats/completeness_score.cu - test/stats/contingencyMatrix.cu - test/stats/cov.cu - test/stats/dispersion.cu - test/stats/entropy.cu - test/stats/histogram.cu - test/stats/homogeneity_score.cu - test/stats/information_criterion.cu - test/stats/kl_divergence.cu - test/stats/mean.cu - test/stats/meanvar.cu - test/stats/mean_center.cu - test/stats/minmax.cu - test/stats/mutual_info_score.cu - test/stats/r2_score.cu - test/stats/rand_index.cu - test/stats/regression_metrics.cu - test/stats/silhouette_score.cu - test/stats/stddev.cu - test/stats/sum.cu - test/stats/trustworthiness.cu - test/stats/weighted_mean.cu - test/stats/v_measure.cu - LIB - EXPLICIT_INSTANTIATE_ONLY - ) + ConfigureTest( + NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu + test/lap/lap.cu test/sparse/mst.cu LIB 
EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest( + NAME + SPARSE_TEST + PATH + test/sparse/add.cu + test/sparse/convert_coo.cu + test/sparse/convert_csr.cu + test/sparse/csr_row_slice.cu + test/sparse/csr_to_dense.cu + test/sparse/csr_transpose.cu + test/sparse/degree.cu + test/sparse/filter.cu + test/sparse/norm.cu + test/sparse/normalize.cu + test/sparse/reduce.cu + test/sparse/row_op.cu + test/sparse/sort.cu + test/sparse/spgemmi.cu + test/sparse/symmetrize.cu + ) + + ConfigureTest( + NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu + test/sparse/gram.cu LIB EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest( + NAME + SPARSE_NEIGHBORS_TEST + PATH + test/sparse/neighbors/cross_component_nn.cu + test/sparse/neighbors/brute_force.cu + test/sparse/neighbors/knn_graph.cu + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest( + NAME + NEIGHBORS_TEST + PATH + test/neighbors/knn.cu + test/neighbors/fused_l2_knn.cu + test/neighbors/tiled_knn.cu + test/neighbors/haversine.cu + test/neighbors/ball_cover.cu + test/neighbors/epsilon_neighborhood.cu + test/neighbors/refine.cu + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest( + NAME + NEIGHBORS_ANN_CAGRA_TEST + PATH + test/neighbors/ann_cagra/test_float_uint32_t.cu + test/neighbors/ann_cagra/test_int8_t_uint32_t.cu + test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu + test/neighbors/ann_cagra/test_float_int64_t.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu + src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu + src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu + src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu + src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu 
+ LIB + EXPLICIT_INSTANTIATE_ONLY + GPUS + 1 + PERCENT + 100 + ) + + ConfigureTest( + NAME + NEIGHBORS_ANN_IVF_TEST + PATH + test/neighbors/ann_ivf_flat/test_float_int64_t.cu + test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu + test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu + test/neighbors/ann_ivf_pq/test_float_int64_t.cu + test/neighbors/ann_ivf_pq/test_float_uint32_t.cu + test/neighbors/ann_ivf_pq/test_float_int64_t.cu + test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu + test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu + LIB + EXPLICIT_INSTANTIATE_ONLY + GPUS + 1 + PERCENT + 100 + ) + + ConfigureTest( + NAME NEIGHBORS_SELECTION_TEST PATH test/neighbors/selection.cu LIB EXPLICIT_INSTANTIATE_ONLY + GPUS 1 PERCENT 50 + ) + + ConfigureTest( + NAME + STATS_TEST + PATH + test/stats/accuracy.cu + test/stats/adjusted_rand_index.cu + test/stats/completeness_score.cu + test/stats/contingencyMatrix.cu + test/stats/cov.cu + test/stats/dispersion.cu + test/stats/entropy.cu + test/stats/histogram.cu + test/stats/homogeneity_score.cu + test/stats/information_criterion.cu + test/stats/kl_divergence.cu + test/stats/mean.cu + test/stats/meanvar.cu + test/stats/mean_center.cu + test/stats/minmax.cu + test/stats/mutual_info_score.cu + test/stats/r2_score.cu + test/stats/rand_index.cu + test/stats/regression_metrics.cu + test/stats/silhouette_score.cu + test/stats/stddev.cu + test/stats/sum.cu + test/stats/trustworthiness.cu + test/stats/weighted_mean.cu + test/stats/v_measure.cu + LIB + EXPLICIT_INSTANTIATE_ONLY + ) ConfigureTest( NAME From e9ef75071b138b75725d5af51477716ff75e0ffe Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 12 Sep 2023 13:17:26 -0400 Subject: [PATCH 57/75] Add testing for CUDA-disabled builds --- cpp/include/raft/core/detail/copy.hpp | 18 +++ .../raft/core/resource/stream_view.hpp | 9 +- cpp/test/CMakeLists.txt | 17 ++- cpp/test/core/mdspan_copy.cpp | 104 ++---------------- cpp/test/core/mdspan_copy.cu | 3 - 5 files changed, 45 insertions(+), 
106 deletions(-) diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index 1c0c258da1..4c65ea6027 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -378,6 +379,7 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr } if constexpr (config::use_intermediate_src) { +#ifndef RAFT_DISABLE_CUDA // Copy to intermediate source on device, then perform necessary // changes in layout on device, directly into final destination using mdarray_t = device_mdarray copy(resources const& res, DstType&& dst, Sr typename mdarray_t::container_policy_type{}); detail::copy(res, intermediate.view(), src); detail::copy(res, dst, intermediate.view()); +#else + // Not possible to reach this due to enable_ifs. Included for safety. + throw(raft::non_cuda_build_error("Copying to device in non-CUDA build")); +#endif } else if constexpr (config::use_intermediate_dst) { +#ifndef RAFT_DISABLE_CUDA // Perform necessary changes in layout on device, then copy to final // destination on host using mdarray_t = device_mdarray copy(resources const& res, DstType&& dst, Sr typename mdarray_t::container_policy_type{}); detail::copy(res, intermediate.view(), src); detail::copy(res, dst, intermediate.view()); +#else + throw(raft::non_cuda_build_error("Copying from device in non-CUDA build")); +#endif } else if constexpr (config::can_use_raft_copy) { #ifndef RAFT_DISABLE_CUDA raft::copy(dst.data_handle(), src.data_handle(), dst.size(), resource::get_cuda_stream(res)); +#else + // Not possible to reach this due to enable_ifs. Included for safety. 
+ throw(raft::non_cuda_build_error("Copying to from or on device in non-CUDA build")); #endif } else if constexpr (config::can_use_cublas) { +#ifndef RAFT_DISABLE_CUDA auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; auto constexpr const beta = typename std::remove_reference_t::value_type{0}; if constexpr (std::is_same_v) { @@ -438,6 +452,10 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr dst.extent(0), resource::get_cuda_stream(res))); } +#else + // Not possible to reach this due to enable_ifs. Included for safety. + throw(raft::non_cuda_build_error("Copying to from or on device in non-CUDA build")); +#endif } else if constexpr (config::custom_kernel_allowed) { #ifdef __CUDACC__ auto const blocks = std::min( diff --git a/cpp/include/raft/core/resource/stream_view.hpp b/cpp/include/raft/core/resource/stream_view.hpp index 42278f779c..ed7129b622 100644 --- a/cpp/include/raft/core/resource/stream_view.hpp +++ b/cpp/include/raft/core/resource/stream_view.hpp @@ -23,9 +23,7 @@ namespace raft::resource { struct stream_view_resource : public resource { - stream_view_resource(raft::stream_view view = raft::stream_view_per_thread) : stream(view) - { - } + stream_view_resource(raft::stream_view view = raft::stream_view_per_thread) : stream(view) {} void* get_resource() override { return &stream; } ~stream_view_resource() override {} @@ -40,8 +38,7 @@ struct stream_view_resource : public resource { */ struct stream_view_resource_factory : public resource_factory { public: - stream_view_resource_factory(raft::stream_view view = raft::stream_view_per_thread) - : stream(view) + stream_view_resource_factory(raft::stream_view view = raft::stream_view_per_thread) : stream(view) { } resource_type get_resource_type() override { return resource_type::STREAM_VIEW; } @@ -95,7 +92,7 @@ inline void sync_stream_view(const resources& res, raft::stream_view stream) /** * @brief synchronize main stream on the resources instance */ -inline void 
sync_stream_view(const resources& res) { sync_stream(res, get_stream_view(res)); } +inline void sync_stream_view(const resources& res) { sync_stream_view(res, get_stream_view(res)); } /** * @} diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 22e0a2ceb7..cd87424a0d 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -21,7 +21,7 @@ rapids_test_init() function(ConfigureTest) - set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY) + set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY NOCUDA) set(oneValueArgs NAME GPUS PERCENT) set(multiValueArgs PATH TARGETS CONFIGURATIONS) @@ -37,7 +37,11 @@ function(ConfigureTest) set(_RAFT_TEST_PERCENT 100) endif() - set(TEST_NAME ${_RAFT_TEST_NAME}) + if(_RAFT_TEST_NOCUDA) + set(TEST_NAME "${_RAFT_TEST_NAME}_NOCUDA") + else() + set(TEST_NAME ${_RAFT_TEST_NAME}) + endif() add_executable(${TEST_NAME} ${_RAFT_TEST_PATH}) target_link_libraries( @@ -68,6 +72,9 @@ function(ConfigureTest) if(_RAFT_TEST_EXPLICIT_INSTANTIATE_ONLY) target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY") endif() + if(_RAFT_TEST_NOCUDA) + target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_DISABLE_CUDA") + endif() target_include_directories(${TEST_NAME} PUBLIC "$") @@ -125,12 +132,18 @@ if(BUILD_TESTS) test/core/sparse_matrix.cpp test/core/span.cpp test/core/span.cu + test/core/stream_view.cpp test/core/temporary_device_buffer.cu test/test.cpp LIB EXPLICIT_INSTANTIATE_ONLY ) + ConfigureTest( + NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB + EXPLICIT_INSTANTIATE_ONLY NOCUDA + ) + ConfigureTest( NAME DISTANCE_TEST diff --git a/cpp/test/core/mdspan_copy.cpp b/cpp/test/core/mdspan_copy.cpp index bb11b8dadc..b64ad0355b 100644 --- a/cpp/test/core/mdspan_copy.cpp +++ b/cpp/test/core/mdspan_copy.cpp @@ -18,14 +18,17 @@ #include #include #include +#ifndef RAFT_DISABLE_CUDA #include #include +#endif #include +#include namespace raft { TEST(MDSpanCopy, Mdspan1DHostHost) { - auto 
res = device_resources{}; + auto res = resources{}; auto cols = std::uint32_t{2}; auto in_left = make_host_vector(res, cols); @@ -42,6 +45,7 @@ TEST(MDSpanCopy, Mdspan1DHostHost) } } +#ifndef RAFT_DISABLE_CUDA TEST(MDSpanCopy, Mdspan1DHostDevice) { auto res = device_resources{}; @@ -83,10 +87,11 @@ TEST(MDSpanCopy, Mdspan1DDeviceHost) match(float(out_right(i)), float(gen_unique_entry(i)), CompareApprox{0.0001f})); } } +#endif TEST(MDSpanCopy, Mdspan3DHostHost) { - auto res = device_resources{}; + auto res = resources{}; auto constexpr depth = std::uint32_t{500}; auto constexpr rows = std::uint32_t{300}; auto constexpr cols = std::uint32_t{200}; @@ -155,6 +160,7 @@ TEST(MDSpanCopy, Mdspan3DHostHost) } } +#ifndef RAFT_DISABLE_CUDA TEST(MDSpanCopy, Mdspan3DHostDevice) { auto res = device_resources{}; @@ -197,28 +203,6 @@ TEST(MDSpanCopy, Mdspan3DHostDevice) } } - /* copy(res, out_right.view(), in_left.view()); - res.sync_stream(); - for (auto i = std::uint32_t{}; i < depth; ++i) { - for (auto j = std::uint32_t{}; j < rows; ++j) { - for (auto k = std::uint32_t{}; k < cols; ++k) { - ASSERT_TRUE(match( - out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); - } - } - } */ - - /* copy(res, out_left.view(), in_right.view()); - res.sync_stream(); - for (auto i = std::uint32_t{}; i < depth; ++i) { - for (auto j = std::uint32_t{}; j < rows; ++j) { - for (auto k = std::uint32_t{}; k < cols; ++k) { - ASSERT_TRUE(match( - out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); - } - } - } */ - // raft::copy copy(res, out_left.view(), in_left.view()); res.sync_stream(); @@ -286,76 +270,6 @@ TEST(MDSpanCopy, Mdspan2DDeviceDevice) } } } - -/* TEST(MDSpanCopy, Mdspan3DDeviceDevice) -{ - auto res = device_resources{}; - auto constexpr depth = std::uint32_t{50}; - auto constexpr rows = std::uint32_t{30}; - auto constexpr cols = std::uint32_t{20}; - auto in_left = make_device_mdarray( - res, extents{}); - auto in_right = 
make_device_mdarray( - res, extents{}); - auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; - - for (auto i = std::uint32_t{}; i < depth; ++i) { - for (auto j = std::uint32_t{}; j < rows; ++j) { - for (auto k = std::uint32_t{}; k < cols; ++k) { - in_left(i, j, k) = gen_unique_entry(i, j, k); - in_right(i, j, k) = gen_unique_entry(i, j, k); - } - } - } - - auto out_left = make_device_mdarray( res, extents{}); auto out_right = -make_device_mdarray( res, -extents{}); - - // Custom kernel - copy(res, out_right.view(), in_right.view()); - for (auto i = std::uint32_t{}; i < depth; ++i) { - for (auto j = std::uint32_t{}; j < rows; ++j) { - for (auto k = std::uint32_t{}; k < cols; ++k) { - ASSERT_TRUE(match( - out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); - } - } - } - - // Custom kernel - copy(res, out_right.view(), in_left.view()); - for (auto i = std::uint32_t{}; i < depth; ++i) { - for (auto j = std::uint32_t{}; j < rows; ++j) { - for (auto k = std::uint32_t{}; k < cols; ++k) { - ASSERT_TRUE(match( - out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); - } - } - } - - // Custom kernel - copy(res, out_left.view(), in_right.view()); - for (auto i = std::uint32_t{}; i < depth; ++i) { - for (auto j = std::uint32_t{}; j < rows; ++j) { - for (auto k = std::uint32_t{}; k < cols; ++k) { - ASSERT_TRUE(match( - out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); - } - } - } - - // Custom kernel - copy(res, out_left.view(), in_left.view()); - for (auto i = std::uint32_t{}; i < depth; ++i) { - for (auto j = std::uint32_t{}; j < rows; ++j) { - for (auto k = std::uint32_t{}; k < cols; ++k) { - ASSERT_TRUE(match( - out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); - } - } - } -} */ +#endif } // namespace raft diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/test/core/mdspan_copy.cu index 78a128ee6e..f0a22eabe8 100644 --- 
a/cpp/test/core/mdspan_copy.cu +++ b/cpp/test/core/mdspan_copy.cu @@ -21,7 +21,6 @@ #include #include #include -#include namespace raft { TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) @@ -168,10 +167,8 @@ TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) auto out_long = make_host_mdarray( res, extents{}); - RAFT_LOG_WARN("BEGIN dtype conversion without transpose"); copy(res, out_long.view(), in_left.view()); res.sync_stream(); - RAFT_LOG_WARN("END dtype conversion without transpose"); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { for (auto k = std::uint32_t{}; k < cols; ++k) { From 92046e04a0b0b838e3127c8c59a820479803f80b Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 12 Sep 2023 14:08:53 -0400 Subject: [PATCH 58/75] Fix style and revert some unnecessary changes --- cpp/include/raft/core/cuda_support.hpp | 4 ++-- cpp/include/raft/core/host_container_policy.hpp | 17 +++++++++++++---- cpp/include/raft/core/memory_type.hpp | 6 +++--- cpp/include/raft/core/stream_view.hpp | 7 +++---- cpp/test/core/stream_view.cpp | 8 ++++---- 5 files changed, 25 insertions(+), 17 deletions(-) diff --git a/cpp/include/raft/core/cuda_support.hpp b/cpp/include/raft/core/cuda_support.hpp index 2f7730a1cd..07fb95a921 100644 --- a/cpp/include/raft/core/cuda_support.hpp +++ b/cpp/include/raft/core/cuda_support.hpp @@ -16,8 +16,8 @@ #pragma once namespace raft { #ifndef RAFT_DISABLE_CUDA - auto constexpr static const CUDA_ENABLED = true; +auto constexpr static const CUDA_ENABLED = true; #else - auto constexpr static const CUDA_ENABLED = false; +auto constexpr static const CUDA_ENABLED = false; #endif } // namespace raft diff --git a/cpp/include/raft/core/host_container_policy.hpp b/cpp/include/raft/core/host_container_policy.hpp index bbf050fab6..97d3c24d89 100644 --- a/cpp/include/raft/core/host_container_policy.hpp +++ b/cpp/include/raft/core/host_container_policy.hpp @@ -76,8 +76,10 @@ class host_vector_policy { */ template struct 
pinned_vector_policy { - using element_type = ElementType; - using allocator_type = thrust::mr::stateless_resource_allocator; + using element_type = ElementType; + using allocator_type = + thrust::mr::stateless_resource_allocator; using container_type = thrust::host_vector; using pointer = typename container_type::pointer; using const_pointer = typename container_type::const_pointer; @@ -86,9 +88,15 @@ struct pinned_vector_policy { using accessor_policy = std::experimental::default_accessor; using const_accessor_policy = std::experimental::default_accessor; - auto create(raft::resources const&, size_t n) -> container_type { return container_type(n, allocator_); } + auto create(raft::resources const&, size_t n) -> container_type + { + return container_type(n, allocator_); + } - constexpr pinned_vector_policy() noexcept(std::is_nothrow_default_constructible_v) : mr_{}, allocator_{&mr_} {} + constexpr pinned_vector_policy() noexcept(std::is_nothrow_default_constructible_v) + : mr_{}, allocator_{&mr_} + { + } [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference { @@ -102,6 +110,7 @@ struct pinned_vector_policy { [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } + private: thrust::system::cuda::universal_host_pinned_memory_resource mr_; allocator_type allocator_; diff --git a/cpp/include/raft/core/memory_type.hpp b/cpp/include/raft/core/memory_type.hpp index 80c10991fb..961a5e35e6 100644 --- a/cpp/include/raft/core/memory_type.hpp +++ b/cpp/include/raft/core/memory_type.hpp @@ -18,10 +18,10 @@ namespace raft { enum class memory_type : std::uint8_t { - host = std::uint8_t{0}, - device = std::uint8_t{1}, + host = std::uint8_t{0}, + device = std::uint8_t{1}, managed = std::uint8_t{2}, - pinned = std::uint8_t{3} + pinned = std::uint8_t{3} }; auto constexpr is_device_accessible(memory_type mem_type) diff --git 
a/cpp/include/raft/core/stream_view.hpp b/cpp/include/raft/core/stream_view.hpp index 1bf8fde6c1..f7e7934dbf 100644 --- a/cpp/include/raft/core/stream_view.hpp +++ b/cpp/include/raft/core/stream_view.hpp @@ -60,7 +60,8 @@ struct stream_view { using underlying_view_type = detail::fail_stream_view; #endif - constexpr stream_view(underlying_view_type base_view = stream_view::get_underlying_per_thread_default()) + constexpr stream_view( + underlying_view_type base_view = stream_view::get_underlying_per_thread_default()) : base_view_{base_view} { } @@ -86,9 +87,7 @@ struct stream_view { auto underlying() { return base_view_; } void synchronize_if_cuda_enabled() { - if constexpr (raft::CUDA_ENABLED) { - base_view_.synchronize(); - } + if constexpr (raft::CUDA_ENABLED) { base_view_.synchronize(); } } private: diff --git a/cpp/test/core/stream_view.cpp b/cpp/test/core/stream_view.cpp index 895ac18c79..715c53fe21 100644 --- a/cpp/test/core/stream_view.cpp +++ b/cpp/test/core/stream_view.cpp @@ -21,7 +21,8 @@ #include #endif namespace raft { -TEST(StreamView, Default) { +TEST(StreamView, Default) +{ auto stream = stream_view_per_thread; ASSERT_EQ(stream.is_per_thread_default(), raft::CUDA_ENABLED); ASSERT_FALSE(stream.is_default()); @@ -35,9 +36,8 @@ TEST(StreamView, Default) { EXPECT_NO_THROW(stream.synchronize_no_throw()); EXPECT_NO_THROW(stream.synchronize_if_cuda_enabled()); #ifndef RAFT_DISABLE_CUDA - static_assert( - std::is_same_v, "underlying should return rmm::cuda_stream_view" - ); + static_assert(std::is_same_v, + "underlying should return rmm::cuda_stream_view"); #endif } } // namespace raft From a0a5b69e1127d1bade431b94888756e99e7b17a8 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 12 Sep 2023 14:12:41 -0400 Subject: [PATCH 59/75] Remove changes related to mdbuffer --- .../raft/core/device_container_policy.hpp | 66 ------------------- .../raft/core/host_container_policy.hpp | 57 ---------------- 2 files changed, 123 deletions(-) diff --git 
a/cpp/include/raft/core/device_container_policy.hpp b/cpp/include/raft/core/device_container_policy.hpp index b24cab9e3d..011de307db 100644 --- a/cpp/include/raft/core/device_container_policy.hpp +++ b/cpp/include/raft/core/device_container_policy.hpp @@ -21,7 +21,6 @@ * limitations under the License. */ #pragma once -#ifndef RAFT_DISABLE_CUDA #include #include @@ -33,7 +32,6 @@ #include #include #include -#include #include @@ -197,68 +195,4 @@ class device_uvector_policy { rmm::mr::device_memory_resource* mr_{nullptr}; }; -/** - * @brief A container policy for managed mdarray. - */ -template -class managed_uvector_policy { - public: - using element_type = ElementType; - using container_type = device_uvector; - // FIXME(jiamingy): allocator type is not supported by rmm::device_uvector - using pointer = typename container_type::pointer; - using const_pointer = typename container_type::const_pointer; - using reference = device_reference; - using const_reference = device_reference; - - using accessor_policy = std::experimental::default_accessor; - using const_accessor_policy = std::experimental::default_accessor; - - public: - auto create(raft::resources const& res, size_t n) -> container_type - { - return container_type(n, resource::get_cuda_stream(res), &mr_); - } - - managed_uvector_policy() = default; - - [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference - { - return c[n]; - } - [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept - -> const_reference - { - return c[n]; - } - - [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } - [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } - - private: - rmm::mr::managed_memory_resource mr_{}; -}; - -} // namespace raft -#else -#include -namespace raft { - -// Provide placeholders that will allow CPU-GPU interoperable codebases to -// compile in non-CUDA mode but which will 
throw exceptions at runtime on any -// attempt to touch device data - -template -using device_reference = detail::fail_reference; - -template -using device_uvector = detail::fail_container; - -template -using device_uvector_policy = detail::fail_container_policy; - -template -using managed_uvector_policy = detail::fail_container_policy; - } // namespace raft -#endif diff --git a/cpp/include/raft/core/host_container_policy.hpp b/cpp/include/raft/core/host_container_policy.hpp index 97d3c24d89..3b3538ea20 100644 --- a/cpp/include/raft/core/host_container_policy.hpp +++ b/cpp/include/raft/core/host_container_policy.hpp @@ -24,13 +24,6 @@ #include #include #include -#ifndef RAFT_DISABLE_CUDA -#include -#include -#include -#else -#include -#endif namespace raft { @@ -69,54 +62,4 @@ class host_vector_policy { [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } }; - -#ifndef RAFT_DISABLE_CUDA -/** - * @brief A container policy for pinned mdarray. 
- */ -template -struct pinned_vector_policy { - using element_type = ElementType; - using allocator_type = - thrust::mr::stateless_resource_allocator; - using container_type = thrust::host_vector; - using pointer = typename container_type::pointer; - using const_pointer = typename container_type::const_pointer; - using reference = element_type&; - using const_reference = element_type const&; - using accessor_policy = std::experimental::default_accessor; - using const_accessor_policy = std::experimental::default_accessor; - - auto create(raft::resources const&, size_t n) -> container_type - { - return container_type(n, allocator_); - } - - constexpr pinned_vector_policy() noexcept(std::is_nothrow_default_constructible_v) - : mr_{}, allocator_{&mr_} - { - } - - [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference - { - return c[n]; - } - [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept - -> const_reference - { - return c[n]; - } - - [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } - [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } - - private: - thrust::system::cuda::universal_host_pinned_memory_resource mr_; - allocator_type allocator_; -}; -#else -template -using pinned_vector_policy = detail::fail_container_policy; -#endif } // namespace raft From 58389ecbf7edbb3eab7e2b8294918a110546a70c Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 12 Sep 2023 19:06:38 -0400 Subject: [PATCH 60/75] Remove change related to mdbuffer --- cpp/include/raft/core/memory_type.hpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/cpp/include/raft/core/memory_type.hpp b/cpp/include/raft/core/memory_type.hpp index 961a5e35e6..cd37a0ee50 100644 --- a/cpp/include/raft/core/memory_type.hpp +++ b/cpp/include/raft/core/memory_type.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,15 +14,9 @@ * limitations under the License. */ #pragma once -#include namespace raft { -enum class memory_type : std::uint8_t { - host = std::uint8_t{0}, - device = std::uint8_t{1}, - managed = std::uint8_t{2}, - pinned = std::uint8_t{3} -}; +enum class memory_type { host, device, managed, pinned }; auto constexpr is_device_accessible(memory_type mem_type) { From 0a19ae5dd9fa71ec37b497595ca032a6906d6850 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Tue, 12 Sep 2023 19:07:02 -0400 Subject: [PATCH 61/75] Correctly handle proxy references in mdspan copy kernel --- cpp/include/raft/core/detail/copy.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index 4c65ea6027..339d2d597f 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -239,15 +239,15 @@ make_index_sequence{}); * indicated element. 
*/ template -__device__ auto& get_mdspan_elem(MdspanType& md, - IdxType const* indices, - index_sequence) +__device__ decltype(auto) get_mdspan_elem(MdspanType md, + IdxType const* indices, + index_sequence) { return md(indices[Idx]...); } template -__device__ auto& get_mdspan_elem(MdspanType& md, IdxType const* indices) +__device__ decltype(auto) get_mdspan_elem(MdspanType md, IdxType const* indices) { return get_mdspan_elem( md, indices, make_index_sequence{}); From 06752076db4e68f8a07acc88c607fc160eee6a20 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Wed, 13 Sep 2023 14:18:13 -0400 Subject: [PATCH 62/75] Check for unique destination layout in any parallel copy --- cpp/include/raft/core/detail/copy.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index 339d2d597f..d78564b44f 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -91,6 +91,13 @@ struct mdspan_copyable { auto static constexpr const same_layout = std::is_same_v; + auto static check_for_unique_dst(dst_type dst) + { + if constexpr (!dst_type::is_always_unique()) { + RAFT_EXPECTS(dst.is_unique(), "Destination mdspan must be unique for parallelized copies"); + } + } + auto static constexpr const src_contiguous = std::disjunction_v, std::is_same>; @@ -458,6 +465,7 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr #endif } else if constexpr (config::custom_kernel_allowed) { #ifdef __CUDACC__ + config::check_for_unique_dst(dst); auto const blocks = std::min( // This maximum is somewhat arbitrary. 
Could query the device to see // how many blocks we could reasonably allow, but this is probably From 8ad9434e3706bb6c778aa03ae04208cc42223b3b Mon Sep 17 00:00:00 2001 From: William Hicks Date: Wed, 13 Sep 2023 14:24:29 -0400 Subject: [PATCH 63/75] Use perfect forwarding for copy wrappers --- cpp/include/raft/core/copy.cuh | 8 ++++---- cpp/include/raft/core/copy.hpp | 4 ++-- cpp/include/raft/core/detail/copy.hpp | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/core/copy.cuh b/cpp/include/raft/core/copy.cuh index 2e5b0f9a46..3514f01844 100644 --- a/cpp/include/raft/core/copy.cuh +++ b/cpp/include/raft/core/copy.cuh @@ -51,9 +51,9 @@ namespace raft { template detail::mdspan_copyable_with_kernel_t copy(resources const& res, DstType&& dst, - SrcType const& src) + SrcType&& src) { - detail::copy(res, dst, src); + detail::copy(res, std::forward(dst), std::forward(src)); } #ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED @@ -61,9 +61,9 @@ detail::mdspan_copyable_with_kernel_t copy(resources const& re template detail::mdspan_uncopyable_with_kernel_t copy(resources const& res, DstType&& dst, - SrcType const& src) + SrcType&& src) { - detail::copy(res, dst, src); + detail::copy(res, std::forward(dst), std::forward(src)); } #endif } // namespace raft diff --git a/cpp/include/raft/core/copy.hpp b/cpp/include/raft/core/copy.hpp index 4662ed5655..4dc96b394d 100644 --- a/cpp/include/raft/core/copy.hpp +++ b/cpp/include/raft/core/copy.hpp @@ -53,9 +53,9 @@ namespace raft { template detail::mdspan_uncopyable_with_kernel_t copy(resources const& res, DstType&& dst, - SrcType const& src) + SrcType&& src) { - detail::copy(res, dst, src); + detail::copy(res, std::forward(dst), std::forward(src)); } #endif diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index d78564b44f..7444e5626a 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -378,7 +378,7 @@ __global__ 
mdspan_copyable_with_kernel_t mdspan_copy_kernel(Ds #endif template -mdspan_copyable_t copy(resources const& res, DstType&& dst, SrcType const& src) +mdspan_copyable_t copy(resources const& res, DstType&& dst, SrcType&& src) { using config = mdspan_copyable; for (auto i = std::size_t{}; i < config::src_rank; ++i) { From fdbc9ee35304f07264dce5f60b811d4fc53e00e0 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Wed, 13 Sep 2023 14:30:43 -0400 Subject: [PATCH 64/75] Correct comment for dimension iteration order --- cpp/include/raft/core/detail/copy.hpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index 7444e5626a..448b830b36 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -278,7 +278,13 @@ __device__ auto increment_indices(IdxType* indices, #pragma unroll for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { - // Iterate through dimensions in order from slowest to fastest varying + // Iterate through dimensions in order from slowest to fastest varying for + // layout_right and layout_left. Otherwise, just iterate through dimensions + // in order. + // + // TODO(wphicks): It is possible to always iterate through dimensions in + // the slowest to fastest order. Consider this or at minimum expanding to + // padded layouts. 
auto const real_index = [](auto ind) { if constexpr (std::is_same_v) { return MdspanType::rank() - ind - 1; From 21618eafd76b4fcc037bd12b17081ed572e14b84 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Thu, 14 Sep 2023 09:38:27 -0400 Subject: [PATCH 65/75] Add warning about copying to non-unique layouts --- cpp/include/raft/core/copy.cuh | 9 +++++++-- cpp/include/raft/core/copy.hpp | 7 +++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/core/copy.cuh b/cpp/include/raft/core/copy.cuh index 3514f01844..2e779d7b1a 100644 --- a/cpp/include/raft/core/copy.cuh +++ b/cpp/include/raft/core/copy.cuh @@ -34,13 +34,18 @@ namespace raft { * Depending on the specialization, this CUDA header may invoke the kernel and * therefore require a CUDA compiler. * - * - * * Limitations: Currently this function does not support copying directly * between two arbitrary mdspans on different CUDA devices. It is assumed that the caller sets the * correct CUDA device. Furthermore, host-to-host copies that require a transformation of the * underlying memory layout are currently not performant, although they are supported. * + * Note that when copying to an mdspan with a non-unique layout (i.e. the same + * underlying memory is addressed by different element indexes), the source + * data must contain non-unique values for every non-unique destination + * element. If this is not the case, the behavior is undefined. Some copies + * to non-unique layouts which are well-defined will nevertheless fail with an + * exception to avoid race conditions in the underlying copy. + * * @tparam DstType An mdspan type for the destination container. 
* @tparam SrcType An mdspan type for the source container * @param res raft::resources used to provide a stream for copies involving the diff --git a/cpp/include/raft/core/copy.hpp b/cpp/include/raft/core/copy.hpp index 4dc96b394d..cdfb8dbe4d 100644 --- a/cpp/include/raft/core/copy.hpp +++ b/cpp/include/raft/core/copy.hpp @@ -43,6 +43,13 @@ namespace raft { * correct CUDA device. Furthermore, host-to-host copies that require a transformation of the * underlying memory layout are currently not performant, although they are supported. * + * Note that when copying to an mdspan with a non-unique layout (i.e. the same + * underlying memory is addressed by different element indexes), the source + * data must contain non-unique values for every non-unique destination + * element. If this is not the case, the behavior is undefined. Some copies + * to non-unique layouts which are well-defined will nevertheless fail with an + * exception to avoid race conditions in the underlying copy. + * * @tparam DstType An mdspan type for the destination container. 
* @tparam SrcType An mdspan type for the source container * @param res raft::resources used to provide a stream for copies involving the From 18d462ef3ffdffc6bf16805b74cb204d09723460 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Mon, 18 Sep 2023 21:45:00 -0400 Subject: [PATCH 66/75] Add benchmarks for mdspan copy --- cpp/bench/prims/CMakeLists.txt | 3 + cpp/bench/prims/core/copy.cu | 401 ++++++++++++++++++++++++++ cpp/include/raft/core/detail/copy.hpp | 4 +- 3 files changed, 406 insertions(+), 2 deletions(-) create mode 100644 cpp/bench/prims/core/copy.cu diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index e8d4739384..fce535d258 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -32,6 +32,7 @@ function(ConfigureBench) PRIVATE raft::raft raft_internal $<$:raft::compiled> + ${RAFT_CTK_MATH_DEPENDENCIES} benchmark::benchmark Threads::Threads $ @@ -73,6 +74,8 @@ function(ConfigureBench) endfunction() if(BUILD_PRIMS_BENCH) + ConfigureBench(NAME CORE_BENCH PATH bench/prims/core/copy.cu bench/prims/main.cpp) + ConfigureBench( NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY diff --git a/cpp/bench/prims/core/copy.cu b/cpp/bench/prims/core/copy.cu new file mode 100644 index 0000000000..31ee83b924 --- /dev/null +++ b/cpp/bench/prims/core/copy.cu @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::core { + +template +auto constexpr const default_dims = []() { + auto dims = std::array{}; + std::fill(dims.begin(), dims.end(), 2); + return dims; +}(); + +template +auto constexpr const default_dims = std::array{3000000}; + +template +auto constexpr const default_dims = std::array{1000, 3000}; + +template +auto constexpr const default_dims = std::array{20, 300, 500}; + +template > +struct bench_array_type; + +template +struct bench_array_type> { + template + auto static constexpr const extent_type = raft::dynamic_extent; + + using type = + std::conditional_t...>, LayoutPolicy>, + device_mdarray...>, LayoutPolicy>>; +}; + +template +struct params { + std::array dims = default_dims; + using src_array_type = + typename bench_array_type::type; + using dst_array_type = + typename bench_array_type::type; +}; + +template +struct CopyBench : public fixture { + using params_type = + params; + using src_array_type = typename params_type::src_array_type; + using dst_array_type = typename params_type::dst_array_type; + explicit CopyBench(const params_type& ps) + : fixture{true}, + res_{}, + params_{ps}, + src_{ + res_, + typename src_array_type::mapping_type{ + std::apply([](auto... exts) { return make_extents(exts...); }, ps.dims)}, + typename src_array_type::container_policy_type{}, + }, + dst_{ + res_, + typename dst_array_type::mapping_type{ + std::apply([](auto... 
exts) { return make_extents(exts...); }, ps.dims)}, + typename dst_array_type::container_policy_type{}, + } + { + res_.get_cublas_handle(); // initialize cublas handle + auto src_data = std::vector(src_.size()); + std::iota(src_data.begin(), src_data.end(), SrcT{}); + raft::copy(src_.data_handle(), src_data.data(), src_.size(), res_.get_stream()); + } + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { raft::copy(res_, dst_.view(), src_.view()); }); + } + + private: + raft::device_resources res_; + params_type params_; + src_array_type src_; + dst_array_type dst_; +}; + +template +auto static const inputs = std::vector{ParamsT{}}; + +#define COPY_REGISTER(BenchT) \ + RAFT_BENCH_REGISTER(BenchT, "BenchT", inputs) + +using copy_bench_device_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_3d_diff_dtype_same_layout = CopyBench; + +using copy_bench_host_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout_float_float = CopyBench; +using copy_bench_host_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_host_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_device_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_host_1d_diff_dtype_diff_layout = CopyBench; +using 
copy_bench_device_host_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_host_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_device_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_host_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_host_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_3d_diff_dtype_same_layout = CopyBench; + +// COPY_REGISTER(copy_bench_same_dtype_1d_host_host); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout_float_float); +COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_device_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_1d_diff_dtype_diff_layout); 
+COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_diff_layout); + +} // namespace raft::bench::core diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index 448b830b36..23d43f9217 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -264,12 +264,12 @@ __device__ decltype(auto) get_mdspan_elem(MdspanType md, IdxType const* indices) * by increment. Store the result in indices. Return true if the new * indices are valid for the input mdspan. 
*/ -template +template __device__ auto increment_indices(IdxType* indices, MdspanType const& md, IdxType const* old_indices, IdxType const* index_strides, - IdxType increment) + IncrType increment) { #pragma unroll for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { From 6e91a1c125df0b12d9bd99f1e66833c8cf8daff3 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Wed, 20 Sep 2023 18:00:45 -0400 Subject: [PATCH 67/75] Correct check for assignability in mdspan copy --- cpp/include/raft/core/detail/copy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index 23d43f9217..3976d72e97 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -67,7 +67,7 @@ struct mdspan_copyable { using src_element_type = typename src_type::element_type; auto static constexpr const same_dtype = std::is_same_v; auto static constexpr const compatible_dtype = - std::is_convertible_v; + std::is_assignable_v; auto static constexpr const dst_float = std::is_same_v; auto static constexpr const src_float = std::is_same_v; From 55e06fe4abccb8b3d217c78637e9140fd1e82267 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Wed, 20 Sep 2023 18:03:38 -0400 Subject: [PATCH 68/75] Add comment explaining intermediate storage --- cpp/include/raft/core/detail/copy.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index 3976d72e97..1c59e7fa60 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -152,6 +152,9 @@ struct mdspan_copyable { std::bool_constant, std::bool_constant>; + // Do we need intermediate storage on device in order to perform + // non-trivial layout or dtype conversions after copying source from host or + // before copying converted results back to host? 
auto static constexpr const requires_intermediate = !both_host_accessible && !both_device_accessible && !can_use_raft_copy; From faa402a8d868dc84e480ec81aa011375599d1274 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Thu, 21 Sep 2023 11:25:57 -0400 Subject: [PATCH 69/75] Correct dtype compatibility test --- cpp/include/raft/core/detail/copy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index 1c59e7fa60..c9fc04a01f 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -67,7 +67,7 @@ struct mdspan_copyable { using src_element_type = typename src_type::element_type; auto static constexpr const same_dtype = std::is_same_v; auto static constexpr const compatible_dtype = - std::is_assignable_v; + std::is_assignable_v; auto static constexpr const dst_float = std::is_same_v; auto static constexpr const src_float = std::is_same_v; From 2eba34d3cada75b8390c4dba90a6477e7dcff923 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Thu, 21 Sep 2023 18:58:15 -0400 Subject: [PATCH 70/75] Provide cleaner compile error for using copy with unsupported types --- cpp/include/raft/core/copy.cuh | 6 +++--- cpp/include/raft/core/copy.hpp | 6 +++--- cpp/include/raft/core/detail/copy.hpp | 28 ++++++++++++++++++++------- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/cpp/include/raft/core/copy.cuh b/cpp/include/raft/core/copy.cuh index 2e779d7b1a..f256f9ea0f 100644 --- a/cpp/include/raft/core/copy.cuh +++ b/cpp/include/raft/core/copy.cuh @@ -64,9 +64,9 @@ detail::mdspan_copyable_with_kernel_t copy(resources const& re #ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED #define RAFT_NON_CUDA_COPY_IMPLEMENTED template -detail::mdspan_uncopyable_with_kernel_t copy(resources const& res, - DstType&& dst, - SrcType&& src) +detail::mdspan_copyable_not_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) { detail::copy(res, 
std::forward(dst), std::forward(src)); } diff --git a/cpp/include/raft/core/copy.hpp b/cpp/include/raft/core/copy.hpp index cdfb8dbe4d..0a16b742a2 100644 --- a/cpp/include/raft/core/copy.hpp +++ b/cpp/include/raft/core/copy.hpp @@ -58,9 +58,9 @@ namespace raft { * @param src The source mdspan. */ template -detail::mdspan_uncopyable_with_kernel_t copy(resources const& res, - DstType&& dst, - SrcType&& src) +detail::mdspan_copyable_not_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) { detail::copy(res, std::forward(dst), std::forward(src)); } diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index c9fc04a01f..5457a08df3 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -39,15 +39,28 @@ namespace raft { namespace detail { -template -struct mdspan_copyable {}; +template +struct mdspan_copyable : std::false_type { + auto static constexpr const custom_kernel_allowed = false; + auto static constexpr const custom_kernel_not_allowed = false; +}; /* * A helper struct used to determine whether one mdspan type can be copied to * another and if so how */ template -struct mdspan_copyable { +struct mdspan_copyable>>, + std::bool_constant>>>>> { using dst_type = std::remove_reference_t; using src_type = std::remove_reference_t; @@ -183,6 +196,7 @@ struct mdspan_copyable { std::conjunction_v, std::bool_constant>; + auto static constexpr const custom_kernel_not_allowed = !custom_kernel_allowed; auto static constexpr const custom_kernel_required = std::conjunction_v, std::bool_constant>; @@ -205,16 +219,16 @@ template auto static constexpr const mdspan_copyable_with_kernel_v = mdspan_copyable::custom_kernel_allowed; template -auto static constexpr const mdspan_uncopyable_with_kernel_v = - !mdspan_copyable::custom_kernel_allowed; +auto static constexpr const mdspan_copyable_not_with_kernel_v = + mdspan_copyable::custom_kernel_not_allowed; template using 
mdspan_copyable_with_kernel_t = std::enable_if_t, T>; template -using mdspan_uncopyable_with_kernel_t = - std::enable_if_t, T>; +using mdspan_copyable_not_with_kernel_t = + std::enable_if_t, T>; #ifdef __CUDACC__ auto static constexpr const mdspan_copy_tile_dim = 32; From 4389b6465e6b2527f0e110a9319be31ccbed52eb Mon Sep 17 00:00:00 2001 From: William Hicks Date: Fri, 22 Sep 2023 10:17:36 -0400 Subject: [PATCH 71/75] Update stream_view docs --- cpp/include/raft/core/resource/stream_view.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/core/resource/stream_view.hpp b/cpp/include/raft/core/resource/stream_view.hpp index ed7129b622..326e134ff0 100644 --- a/cpp/include/raft/core/resource/stream_view.hpp +++ b/cpp/include/raft/core/resource/stream_view.hpp @@ -68,10 +68,10 @@ inline raft::stream_view get_stream_view(resources const& res) }; /** - * Load a rmm::cuda_stream_view from a resources instance (and populate it on the res + * Load a raft::stream__view from a resources instance (and populate it on the res * if needed). 
* @param[in] res raft resources object for managing resources - * @param[in] stream_view cuda stream view + * @param[in] stream_view raft stream view */ inline void set_stream_view(resources const& res, raft::stream_view view) { From 62ac60abf37e346ca516d0a05337f0857036c6c7 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Fri, 22 Sep 2023 15:11:06 -0400 Subject: [PATCH 72/75] Update stream view docs --- cpp/include/raft/core/resource/stream_view.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/core/resource/stream_view.hpp b/cpp/include/raft/core/resource/stream_view.hpp index 326e134ff0..ccf516076f 100644 --- a/cpp/include/raft/core/resource/stream_view.hpp +++ b/cpp/include/raft/core/resource/stream_view.hpp @@ -71,7 +71,7 @@ inline raft::stream_view get_stream_view(resources const& res) * Load a raft::stream__view from a resources instance (and populate it on the res * if needed). * @param[in] res raft resources object for managing resources - * @param[in] stream_view raft stream view + * @param[in] view raft stream view */ inline void set_stream_view(resources const& res, raft::stream_view view) { From a8b17a85ea41e44120a967980d4a788439f27bbc Mon Sep 17 00:00:00 2001 From: William Hicks Date: Mon, 2 Oct 2023 11:29:23 -0400 Subject: [PATCH 73/75] Add static asserts for mdspan_copyable --- cpp/test/core/mdspan_copy.cpp | 56 +++++++++++++++++++++++++---------- cpp/test/core/mdspan_copy.cu | 54 +++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 15 deletions(-) diff --git a/cpp/test/core/mdspan_copy.cpp b/cpp/test/core/mdspan_copy.cpp index b64ad0355b..2f938e3035 100644 --- a/cpp/test/core/mdspan_copy.cpp +++ b/cpp/test/core/mdspan_copy.cpp @@ -38,7 +38,10 @@ TEST(MDSpanCopy, Mdspan1DHostHost) } auto out_right = make_host_vector(res, cols); - // std::copy + static_assert(detail::mdspan_copyable::can_use_std_copy, + "Current implementation should use std::copy for this copy"); copy(res, out_right.view(), 
in_left.view()); for (auto i = std::uint32_t{}; i < cols; ++i) { ASSERT_TRUE(match(out_right(i), double(gen_unique_entry(i)), CompareApprox{0.0001})); @@ -57,8 +60,11 @@ TEST(MDSpanCopy, Mdspan1DHostDevice) in_left(i) = gen_unique_entry(i); } - // raft::copy auto out_right = make_device_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); copy(res, out_right.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < cols; ++i) { @@ -78,8 +84,11 @@ TEST(MDSpanCopy, Mdspan1DDeviceHost) in_left(i) = gen_unique_entry(i); } - // raft::copy auto out_right = make_host_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); copy(res, out_right.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < cols; ++i) { @@ -95,9 +104,9 @@ TEST(MDSpanCopy, Mdspan3DHostHost) auto constexpr depth = std::uint32_t{500}; auto constexpr rows = std::uint32_t{300}; auto constexpr cols = std::uint32_t{200}; - auto in_left = make_host_mdarray( + auto in_left = make_host_mdarray( res, extents{}); - auto in_right = make_host_mdarray( + auto in_right = make_host_mdarray( res, extents{}); auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; @@ -112,10 +121,13 @@ TEST(MDSpanCopy, Mdspan3DHostHost) auto out_left = make_host_mdarray( res, extents{}); - auto out_right = make_host_mdarray( + auto out_right = make_host_mdarray( res, extents{}); - // std::copy + static_assert(detail::mdspan_copyable::can_use_std_copy, + "Current implementation should use std::copy for this copy"); copy(res, out_right.view(), in_right.view()); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { @@ -126,7 +138,6 @@ TEST(MDSpanCopy, Mdspan3DHostHost) } } - // simd or custom logic copy(res, out_right.view(), 
in_left.view()); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { @@ -137,7 +148,6 @@ TEST(MDSpanCopy, Mdspan3DHostHost) } } - // simd or custom logic copy(res, out_left.view(), in_right.view()); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { @@ -148,7 +158,9 @@ TEST(MDSpanCopy, Mdspan3DHostHost) } } - // std::copy + static_assert(detail::mdspan_copyable:: + can_use_std_copy, + "Current implementation should use std::copy for this copy"); copy(res, out_left.view(), in_left.view()); for (auto i = std::uint32_t{}; i < depth; ++i) { for (auto j = std::uint32_t{}; j < rows; ++j) { @@ -190,7 +202,10 @@ TEST(MDSpanCopy, Mdspan3DHostDevice) make_device_mdarray( res, extents{}); - // raft::copy + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); copy(res, out_right.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { @@ -203,7 +218,9 @@ TEST(MDSpanCopy, Mdspan3DHostDevice) } } - // raft::copy + static_assert(detail::mdspan_copyable:: + can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); copy(res, out_left.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { @@ -240,7 +257,10 @@ TEST(MDSpanCopy, Mdspan2DDeviceDevice) auto out_right = make_device_mdarray( res, extents{}); - // raft::copy + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); copy(res, out_right.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { @@ -250,7 +270,10 @@ TEST(MDSpanCopy, Mdspan2DDeviceDevice) } } - // cublas + static_assert(detail::mdspan_copyable::can_use_cublas, + "Current implementation should use cuBLAS for this copy"); copy(res, out_right.view(), in_left.view()); res.sync_stream(); for (auto i = 
std::uint32_t{}; i < rows; ++i) { @@ -260,7 +283,10 @@ TEST(MDSpanCopy, Mdspan2DDeviceDevice) } } - // cublas + static_assert(detail::mdspan_copyable::can_use_cublas, + "Current implementation should use cuBLAS for this copy"); copy(res, out_left.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/test/core/mdspan_copy.cu index f0a22eabe8..95d7d3befd 100644 --- a/cpp/test/core/mdspan_copy.cu +++ b/cpp/test/core/mdspan_copy.cu @@ -50,6 +50,9 @@ TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) auto out_long = make_device_mdarray( res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_long.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { @@ -66,6 +69,9 @@ TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) auto out_right = make_device_mdarray( res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_right.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { @@ -76,6 +82,9 @@ TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) } } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_left.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { @@ -113,6 +122,9 @@ TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda) res.sync_stream(); // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_right.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { @@ -123,6 +135,9 @@ TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda) } // Test dtype conversion with transpose + static_assert( + 
detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_right.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { @@ -131,6 +146,9 @@ TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda) double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_left.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { @@ -167,6 +185,9 @@ TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) auto out_long = make_host_mdarray( res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_long.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { @@ -183,6 +204,9 @@ TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) auto out_right = make_host_mdarray( res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_right.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { @@ -193,6 +217,9 @@ TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) } } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_left.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { @@ -230,6 +257,9 @@ TEST(MDSpanCopy, Mdspan2DDeviceHostCuda) res.sync_stream(); // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_right.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { @@ -240,6 +270,9 @@ TEST(MDSpanCopy, Mdspan2DDeviceHostCuda) } // 
Test dtype conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_right.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { @@ -248,6 +281,9 @@ TEST(MDSpanCopy, Mdspan2DDeviceHostCuda) double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_left.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { @@ -285,6 +321,9 @@ TEST(MDSpanCopy, Mdspan3DHostDeviceCuda) auto out_long = make_device_mdarray( res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_long.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { @@ -301,6 +340,9 @@ TEST(MDSpanCopy, Mdspan3DHostDeviceCuda) auto out_right = make_device_mdarray( res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_right.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { @@ -311,6 +353,9 @@ TEST(MDSpanCopy, Mdspan3DHostDeviceCuda) } } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_left.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < depth; ++i) { @@ -348,6 +393,9 @@ TEST(MDSpanCopy, Mdspan2DHostDeviceCuda) res.sync_stream(); // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_right.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { @@ 
-358,6 +406,9 @@ TEST(MDSpanCopy, Mdspan2DHostDeviceCuda) } // Test dtype conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_right.view(), in_left.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { @@ -366,6 +417,9 @@ TEST(MDSpanCopy, Mdspan2DHostDeviceCuda) double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); } } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); copy(res, out_left.view(), in_right.view()); res.sync_stream(); for (auto i = std::uint32_t{}; i < rows; ++i) { From 722425ca6e1b332e612ef05deae5eea2381357e0 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Mon, 2 Oct 2023 13:24:28 -0400 Subject: [PATCH 74/75] Correct iteration in host-to-host copies --- cpp/include/raft/core/detail/copy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index 5457a08df3..b23660fefe 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -527,7 +527,7 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr // cache-oblivious implementation should work through dimensions in // order of increasing stride. 
auto dim = std::size_t{}; - while ((indices[dim]++) == src.extent(dim)) { + while ((++indices[dim]) == src.extent(dim)) { indices[dim] = typename config::index_type{}; ++dim; } From 0863db07a505bb28a4e14147c129b245954b1067 Mon Sep 17 00:00:00 2001 From: William Hicks Date: Wed, 4 Oct 2023 15:13:58 -0400 Subject: [PATCH 75/75] Fix double-defined target from branch merge --- cpp/bench/prims/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index 9eb58adf80..5da2cd916b 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -74,13 +74,14 @@ function(ConfigureBench) endfunction() if(BUILD_PRIMS_BENCH) - ConfigureBench(NAME CORE_BENCH PATH bench/prims/core/copy.cu bench/prims/main.cpp) + ConfigureBench( + NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/core/copy.cu bench/prims/main.cpp + ) ConfigureBench( NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) - ConfigureBench(NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/main.cpp) ConfigureBench( NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu