From 7e328ff8eed6410fea3c79db8bfca6dd07050c66 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 13 Mar 2023 20:54:27 -0400 Subject: [PATCH] Revert "Generic linalg::map (#1329)" (#1336) The change to the public API was siginficant enough here that it's going to require updates on the cuml side which invoke `map_k` with very different arguments (e.g. mdspan, different order). For now, it's best we revert this commmit to unblock cuml and then we can proceed by keeping the old APIs and deprecating them until we change cuml. Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Ben Frederickson (https://github.com/benfred) URL: https://github.com/rapidsai/raft/pull/1336 --- cpp/include/raft/linalg/binary_op.cuh | 26 +- cpp/include/raft/linalg/detail/binary_op.cuh | 98 ++++++ cpp/include/raft/linalg/detail/init.hpp | 54 ++++ cpp/include/raft/linalg/detail/map.cuh | 224 ++----------- .../raft/linalg/detail/strided_reduction.cuh | 3 +- cpp/include/raft/linalg/detail/ternary_op.cuh | 105 ++++++ cpp/include/raft/linalg/detail/unary_op.cuh | 99 ++++++ cpp/include/raft/linalg/init.cuh | 17 +- cpp/include/raft/linalg/map.cuh | 306 +++++------------- cpp/include/raft/linalg/ternary_op.cuh | 36 ++- cpp/include/raft/linalg/unary_op.cuh | 39 ++- cpp/include/raft/matrix/init.cuh | 5 +- .../raft/neighbors/detail/ivf_pq_search.cuh | 30 +- cpp/test/linalg/map.cu | 8 +- 14 files changed, 580 insertions(+), 470 deletions(-) create mode 100644 cpp/include/raft/linalg/detail/binary_op.cuh create mode 100644 cpp/include/raft/linalg/detail/init.hpp create mode 100644 cpp/include/raft/linalg/detail/ternary_op.cuh create mode 100644 cpp/include/raft/linalg/detail/unary_op.cuh diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index ed083a1590..966e84965d 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -18,9 +18,12 @@ #pragma once +#include "detail/binary_op.cuh" + #include #include -#include +#include +#include namespace raft { namespace linalg { @@ -49,7 +52,7 @@ template (stream, out, len, op, in1, in2); + detail::binaryOp(out, in1, in2, len, op, stream); } /** @@ -77,7 +80,22 @@ template > void binary_op(raft::device_resources const& handle, InType in1, InType in2, OutType out, Lambda op) { - return map(handle, in1, in2, out, op); + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in2), "Input 2 must be contiguous"); + RAFT_EXPECTS(out.size() == in1.size() && in1.size() == in2.size(), + "Size mismatch between Output and Inputs"); + + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + if (out.size() <= std::numeric_limits::max()) { + binaryOp( + out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream()); + } else { + binaryOp( + out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream()); + } } /** @} */ // end of group binary_op @@ -85,4 +103,4 @@ void binary_op(raft::device_resources const& handle, InType in1, InType in2, Out }; // end namespace linalg }; // end namespace raft -#endif +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh new file mode 100644 index 0000000000..d073e164fd --- /dev/null +++ b/cpp/include/raft/linalg/detail/binary_op.cuh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +__global__ void binaryOpKernel( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op) +{ + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a, b; + OutVecType c; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= InVecType::Ratio; + if (idx >= len) return; + a.load(in1, idx); + b.load(in2, idx); +#pragma unroll + for (int i = 0; i < InVecType::Ratio; ++i) { + c.val.data[i] = op(a.val.data[i], b.val.data[i]); + } + c.store(out, idx); +} + +template +void binaryOpImpl( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); + binaryOpKernel + <<>>(out, in1, in2, len, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * @brief Checks if addresses are aligned on N bytes + */ +inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N) +{ + return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; +} + +template +void binaryOp( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + constexpr auto maxSize = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t in1Addr = uint64_t(in1); + uint64_t in2Addr = uint64_t(in2); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (1 / maxSize) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else { + binaryOpImpl(out, in1, in2, len, op, stream); + } +} + +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/init.hpp b/cpp/include/raft/linalg/detail/init.hpp new file mode 100644 index 0000000000..4718a2cb0e --- /dev/null +++ b/cpp/include/raft/linalg/detail/init.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void range(T* out, int start, int end, cudaStream_t stream) +{ + thrust::counting_iterator first(start); + thrust::counting_iterator last = first + (end - start); + thrust::device_ptr ptr(out); + thrust::copy(rmm::exec_policy(stream), first, last, ptr); +} + +/** + * @brief Like Python range. + * + * Fills the output as out[i] = i. + * + * \param [out] out device array, size [n] + * \param [in] n length of the array + * \param [in] stream cuda stream + */ +template +void range(T* out, int n, cudaStream_t stream) +{ + range(out, 0, n, stream); +} + +} // namespace detail +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index 90b653b711..e0b473bdd4 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -16,207 +16,43 @@ #pragma once -#include +#include #include #include -#include -#include -#include #include -#include +namespace raft { +namespace linalg { +namespace detail { -namespace raft::linalg::detail { - -template -__device__ __forceinline__ auto map_apply(Func f, const IdxT& offset, const InTs&... ins) -> OutT -{ - if constexpr (PassOffset) { - return f(offset, ins...); - } else { - return f(ins...); - } -} - -template -__device__ __forceinline__ void map_kernel_mainloop( - OutT* out_ptr, IdxT offset, IdxT len, Func f, const InTs*... in_ptrs, std::index_sequence) -{ - TxN_t wide; - thrust::tuple...> wide_args; - if (offset + R <= len) { - (thrust::get(wide_args).load(in_ptrs, offset), ...); -#pragma unroll - for (int j = 0; j < R; ++j) { - wide.val.data[j] = map_apply( - f, offset + j, thrust::get(wide_args).val.data[j]...); - } - wide.store(out_ptr, offset); - } -} - -template -__global__ void map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) -{ - const IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; - if constexpr (R <= 1) { - if (tid < len) { - out_ptr[tid] = map_apply(f, tid, in_ptrs[tid]...); - } - } else { - using align_bytes = Pow2; - using align_elems = Pow2; - - // how many elements to skip in order to do aligned vectorized store - const IdxT skip_cnt_left = std::min(IdxT(align_bytes::roundUp(out_ptr) - out_ptr), len); - - // The main loop: process all aligned data - map_kernel_mainloop( - out_ptr, tid * R + skip_cnt_left, len, f, in_ptrs..., std::index_sequence_for()); - - static_assert(WarpSize >= R); - // Processes the skipped elements on the left - if (tid < skip_cnt_left) { - out_ptr[tid] = map_apply(f, tid, in_ptrs[tid]...); - } - // Processes the skipped elements on the right - const IdxT skip_cnt_right = align_elems::mod(len - skip_cnt_left); - const IdxT remain_i = len - skip_cnt_right + tid; - if (remain_i < len) { - out_ptr[remain_i] = - map_apply(f, remain_i, in_ptrs[remain_i]...); - } - } -} - -template -void map_call(rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) -{ - const IdxT len_vectorized = raft::div_rounding_up_safe(len, R); - const int threads = - std::max(WarpSize, std::min(raft::bound_by_power_of_two(len_vectorized), 256)); - const IdxT blocks = raft::div_rounding_up_unsafe(len_vectorized, threads); - map_kernel<<>>(out_ptr, len, f, in_ptrs...); -} - -constexpr int kCoalescedVectorSize = 16; -constexpr int kSmallInputThreshold = 1024; - -struct ratio_selector { - int ratio; - int align; - constexpr inline ratio_selector(int r, int a) : ratio(r), align(a) {} - - template - constexpr static auto ignoring_alignment() -> ratio_selector - { - return ratio_selector{raft::div_rounding_up_safe(kCoalescedVectorSize, sizeof(T)), 0}; - } - - template - explicit ratio_selector(const T* ptr) - { - constexpr auto s = ignoring_alignment(); // NOLINT - align = int(Pow2::roundUp(ptr) - ptr); - ratio = int(s.ratio); - } -}; - -constexpr inline auto operator*(const ratio_selector& a, const ratio_selector& b) -> ratio_selector -{ - auto ratio = std::min(a.ratio, b.ratio); - while ((a.align % ratio) != (b.align % ratio)) { - ratio >>= 1; - } - return ratio_selector{ratio, a.align % ratio}; -} - -template -void map_call_rt( - int r, rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) -{ - if (r >= R) { return map_call(stream, out_ptr, len, f, in_ptrs...); } - if constexpr (R > 1) { - return map_call_rt<(R >> 1), PassOffset>(r, stream, out_ptr, len, f, in_ptrs...); - } -} - -template -void map(rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) -{ - // don't vectorize on small inputs - if (len <= kSmallInputThreshold) { - return map_call<1, PassOffset>(stream, out_ptr, len, f, in_ptrs...); - } - constexpr int kRatio = - (ratio_selector::ignoring_alignment() * ... * ratio_selector::ignoring_alignment()) - .ratio; - static_assert(kRatio > 0, "Unexpected zero vector size."); - const int ratio = (ratio_selector(out_ptr) * ... * ratio_selector(in_ptrs)).ratio; - return map_call_rt(ratio, stream, out_ptr, len, f, in_ptrs...); -} - -template , - typename = raft::enable_if_input_device_mdspan> -void map_check_shape(OutType out, InType in) -{ - RAFT_EXPECTS(raft::is_row_or_column_major(in) && out.size() == in.size(), - "All inputs must be contiguous and have the same size as the output"); -} - -/** - * @brief Map a function over a zero or more inputs and optionally a 0-based flat index - * (element offset). - * - * _Performance note_: when possible, this function loads the argument arrays and stores the output - * array using vectorized cuda load/store instructions. The size of the vectorization depends on the - * size of the largest input/output element type and on the alignment of all pointers. - * - * @tparam PassOffset whether to pass an offset as a first argument to Func - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * @tparam InTypes data-types of the inputs (device_mdspan) - * - * @param[in] res raft::device_resources - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda of type - * ([auto offset], InTypes::value_type xs...) -> OutType::value_type - * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, OutType out, Func f, InTypes... ins) + typename IdxType, + typename MapOp, + int TPB, + typename... Args> +__global__ void mapKernel(OutType* out, IdxType len, MapOp map, const InType* in, Args... args) { - RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); - (map_check_shape(out, ins), ...); + auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (out.size() <= std::numeric_limits::max()) { - map( - res.get_stream(), out.data_handle(), uint32_t(out.size()), f, ins.data_handle()...); - } else { - map( - res.get_stream(), out.data_handle(), uint64_t(out.size()), f, ins.data_handle()...); - } + if (idx < len) { out[idx] = map(in[idx], args[idx]...); } } -} // namespace raft::linalg::detail +template +void mapImpl( + OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ + const int nblks = raft::ceildiv(len, (IdxType)TPB); + mapKernel + <<>>(out, len, map, in, args...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // namespace detail +} // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index 42e79a9285..0e516b4750 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once +#include "unary_op.cuh" #include #include #include diff --git a/cpp/include/raft/linalg/detail/ternary_op.cuh b/cpp/include/raft/linalg/detail/ternary_op.cuh new file mode 100644 index 0000000000..7874f20f56 --- /dev/null +++ b/cpp/include/raft/linalg/detail/ternary_op.cuh @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { +namespace linalg { +namespace detail { +template +__global__ void ternaryOpKernel( + out_t* out, const math_t* in1, const math_t* in2, const math_t* in3, IdxType len, Lambda op) +{ + typedef raft::TxN_t VecType; + VecType a, b, c; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= VecType::Ratio; + if (idx >= len) return; + a.load(in1, idx); + b.load(in2, idx); + c.load(in3, idx); +#pragma unroll + for (int i = 0; i < VecType::Ratio; ++i) { + a.val.data[i] = op(a.val.data[i], b.val.data[i], c.val.data[i]); + } + a.store(out, idx); +} + +template +void ternaryOpImpl(out_t* out, + const math_t* in1, + const math_t* in2, + const math_t* in3, + IdxType len, + Lambda op, + cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : len, (IdxType)TPB); + ternaryOpKernel + <<>>(out, in1, in2, in3, len, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * @brief perform element-wise ternary operation on the input arrays + * @tparam math_t data-type upon which the math operation will be performed + * @tparam Lambda the device-lambda performing the actual operation + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in1 the first input array + * @param in2 the second input array + * @param in3 the third input array + * @param len number of elements in the input array + * @param op the device-lambda + * @param stream cuda stream where to launch work + */ +template +void ternaryOp(out_t* out, + const math_t* in1, + const math_t* in2, + const math_t* in3, + IdxType len, + Lambda op, + cudaStream_t stream) +{ + size_t bytes = len * sizeof(math_t); + if (16 / sizeof(math_t) && bytes % 16 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (8 / sizeof(math_t) && bytes % 8 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (4 / sizeof(math_t) && bytes % 4 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (2 / sizeof(math_t) && bytes % 2 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (1 / sizeof(math_t)) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else { + ternaryOpImpl(out, in1, in2, in3, len, op, stream); + } +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh new file mode 100644 index 0000000000..cdadc6f868 --- /dev/null +++ b/cpp/include/raft/linalg/detail/unary_op.cuh @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +__global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op) +{ + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a; + OutVecType b; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= InVecType::Ratio; + if (idx >= len) return; + a.load(in, idx); +#pragma unroll + for (int i = 0; i < InVecType::Ratio; ++i) { + b.val.data[i] = op(a.val.data[i]); + } + b.store(out, idx); +} + +template +void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); + unaryOpKernel + <<>>(out, in, len, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void unaryOpCaller(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + if (len <= 0) return; // silently skip in case of 0 length input + constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t inAddr = uint64_t(in); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (1 / maxSize) { + unaryOpImpl(out, in, len, op, stream); + } else { + unaryOpImpl(out, in, len, op, stream); + } +} + +template +__global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) +{ + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + if (idx < len) { op(out + idx, idx); } +} + +template +void writeOnlyUnaryOpCaller(OutType* out, IdxType len, Lambda op, cudaStream_t stream) +{ + if (len <= 0) return; // silently skip in case of 0 length input + auto nblks = raft::ceildiv(len, TPB); + writeOnlyUnaryOpKernel<<>>(out, len, op); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/init.cuh b/cpp/include/raft/linalg/init.cuh index ad772b3989..5a810bf2ba 100644 --- a/cpp/include/raft/linalg/init.cuh +++ b/cpp/include/raft/linalg/init.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,10 +18,11 @@ #pragma once -#include +#include "detail/init.hpp" #include -namespace raft::linalg { +namespace raft { +namespace linalg { /** * @brief Like Python range. @@ -36,8 +37,7 @@ namespace raft::linalg { template void range(T* out, int start, int end, cudaStream_t stream) { - return detail::map( - stream, out, end - start, compose_op{cast_op{}, add_const_op{start}}); + detail::range(out, start, end, stream); } /** @@ -52,7 +52,7 @@ void range(T* out, int start, int end, cudaStream_t stream) template void range(T* out, int n, cudaStream_t stream) { - return detail::map(stream, out, n, cast_op{}); + detail::range(out, n, stream); } /** @@ -68,6 +68,7 @@ void zero(T* out, int n, cudaStream_t stream) RAFT_CUDA_TRY(cudaMemsetAsync(static_cast(out), 0, n * sizeof(T), stream)); } -} // namespace raft::linalg +} // namespace linalg +} // namespace raft -#endif +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh index 305a8aa3cc..2b9e6c80a0 100644 --- a/cpp/include/raft/linalg/map.cuh +++ b/cpp/include/raft/linalg/map.cuh @@ -22,267 +22,119 @@ #include #include +#include +#include -namespace raft::linalg { +namespace raft { +namespace linalg { /** - * @defgroup map Mapping ops - * @{ - */ - -/** - * @brief Map a function over zero or more input mdspans of the same size. - * - * The algorithm applied on `k` inputs can be described in a following pseudo-code: - * @code - * for (auto i: [0 ... out.size()]) { - * out[i] = f(in_0[i], in_1[i], ..., in_k[i]) - * } - * @endcode - * - * _Performance note_: when possible, this function loads the argument arrays and stores the output - * array using vectorized cuda load/store instructions. The size of the vectorization depends on the - * size of the largest input/output element type and on the alignment of all pointers. - * - * Usage example: - * @code{.cpp} - * #include - * #include - * #include - * #include - * - * auto input = raft::make_device_vector(res, n); - * ... fill input .. - * auto squares = raft::make_device_vector(res, n); - * raft::linalg::map_offset(res, squares.view(), raft::sq_op{}, input.view()); - * @endcode - * - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * @tparam InTypes data-types of the inputs (device_mdspan) - * - * @param[in] res raft::device_resources - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InTypes::value_type xs...) -> OutType::value_type - * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) + * @brief CUDA version of map + * @tparam InType data-type upon which the math operation will be performed + * @tparam MapOp the device-lambda performing the actual operation + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Args additional parameters + * @tparam OutType data-type in which the result will be stored + * @param out the output of the map operation (assumed to be a device pointer) + * @param len number of elements in the input array + * @param map the device-lambda + * @param stream cuda-stream where to launch this kernel + * @param in the input array + * @param args additional input arrays */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, OutType out, Func f, InTypes... ins) +template +void map_k( + OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... args) { - return detail::map(res, out, f, ins...); + detail::mapImpl( + out, len, map, stream, in, args...); } /** - * @brief Map a function over one mdspan. - * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InType1::value_type x) -> OutType::value_type + * @defgroup map Mapping ops + * @{ */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, InType1 in1, OutType out, Func f) -{ - return detail::map(res, out, f, in1); -} /** - * @brief Map a function over two mdspans. - * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam InType2 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[in] in2 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InType1::value_type x1, InType2::value_type x2) -> OutType::value_type + * @brief CUDA version of map + * @tparam InType data-type for math operation of type raft::device_mdspan + * @tparam MapOp the device-lambda performing the actual operation + * @tparam TPB threads-per-block in the final kernel launched + * @tparam OutType data-type of result of type raft::device_mdspan + * @tparam Args additional parameters + * @param[in] handle raft::device_resources + * @param[in] in the input of type raft::device_mdspan + * @param[out] out the output of the map operation of type raft::device_mdspan + * @param[in] map the device-lambda + * @param[in] args additional input arrays */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, InType1 in1, InType2 in2, OutType out, Func f) + int TPB = 256, + typename... Args, + typename = raft::enable_if_input_device_mdspan, + typename = raft::enable_if_output_device_mdspan> +void map(raft::device_resources const& handle, InType in, OutType out, MapOp map, Args... args) { - return detail::map(res, out, f, in1, in2); + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output is not exhaustive"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input is not exhaustive"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Input and Output"); + + if (out.size() <= std::numeric_limits::max()) { + map_k( + out.data_handle(), out.size(), map, handle.get_stream(), in.data_handle(), args...); + } else { + map_k( + out.data_handle(), out.size(), map, handle.get_stream(), in.data_handle(), args...); + } } /** - * @brief Map a function over three mdspans. - * - * @tparam InType1 data-type of the input 1 (device_mdspan) - * @tparam InType2 data-type of the input 2 (device_mdspan) - * @tparam InType3 data-type of the input 3 (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input 1 (the same size as the output) (device_mdspan) - * @param[in] in2 the input 2 (the same size as the output) (device_mdspan) - * @param[in] in3 the input 3 (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InType1::value_type x1, InType2::value_type x2, InType3::value_type x3) -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map( - const raft::device_resources& res, InType1 in1, InType2 in2, InType3 in3, OutType out, Func f) -{ - return detail::map(res, out, f, in1, in2, in3); -} - -/** - * @brief Map a function over zero-based flat index (element offset) and zero or more inputs. - * - * The algorithm applied on `k` inputs can be described in a following pseudo-code: - * @code - * for (auto i: [0 ... out.size()]) { - * out[i] = f(i, in_0[i], in_1[i], ..., in_k[i]) - * } - * @endcode - * - * _Performance note_: when possible, this function loads the argument arrays and stores the output - * array using vectorized cuda load/store instructions. The size of the vectorization depends on the - * size of the largest input/output element type and on the alignment of all pointers. + * @brief Perform an element-wise unary operation on the input offset into the output array * * Usage example: * @code{.cpp} * #include - * #include + * #include * #include * #include - * + * ... + * raft::handle_t handle; * auto squares = raft::make_device_vector(handle, n); - * raft::linalg::map_offset(res, squares.view(), raft::sq_op{}); + * raft::linalg::map_offset(handle, squares.view(), raft::sq_op()); * @endcode * - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * @tparam InTypes data-types of the inputs (device_mdspan) - * - * @param[in] res raft::device_resources - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InTypes::value_type xs...) -> OutType::value_type - * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) + * @tparam OutType Output mdspan type + * @tparam MapOp The unary operation type with signature `OutT func(const IdxT& idx);` + * @param[in] handle The raft handle + * @param[out] out Output array + * @param[in] op The unary operation */ template , - typename = raft::enable_if_input_device_mdspan> -void map_offset(const raft::device_resources& res, OutType out, Func f, InTypes... ins) + typename MapOp, + typename = raft::enable_if_output_device_mdspan> +void map_offset(const raft::device_resources& handle, OutType out, MapOp op) { - return detail::map(res, out, f, ins...); -} + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); -/** - * @brief Map a function over zero-based flat index (element offset) and one mdspan. - * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InType1::value_type x) -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map_offset(const raft::device_resources& res, InType1 in1, OutType out, Func f) -{ - return detail::map(res, out, f, in1); -} + using out_value_t = typename OutType::value_type; -/** - * @brief Map a function over zero-based flat index (element offset) and two mdspans. - * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam InType2 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[in] in2 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InType1::value_type x1, InType2::value_type x2) -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map_offset(const raft::device_resources& res, InType1 in1, InType2 in2, OutType out, Func f) -{ - return detail::map(res, out, f, in1, in2); -} - -/** - * @brief Map a function over zero-based flat index (element offset) and three mdspans. - * - * @tparam InType1 data-type of the input 1 (device_mdspan) - * @tparam InType2 data-type of the input 2 (device_mdspan) - * @tparam InType3 data-type of the input 3 (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input 1 (the same size as the output) (device_mdspan) - * @param[in] in2 the input 2 (the same size as the output) (device_mdspan) - * @param[in] in3 the input 3 (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InType1::value_type x1, InType2::value_type x2, InType3::value_type x3) - * -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map_offset( - const raft::device_resources& res, InType1 in1, InType2 in2, InType3 in3, OutType out, Func f) -{ - return detail::map(res, out, f, in1, in2, in3); + thrust::tabulate( + handle.get_thrust_policy(), out.data_handle(), out.data_handle() + out.size(), op); } /** @} */ // end of map -} // namespace raft::linalg +} // namespace linalg +}; // namespace raft #endif diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh index 1e347d69be..aa3859bc23 100644 --- a/cpp/include/raft/linalg/ternary_op.cuh +++ b/cpp/include/raft/linalg/ternary_op.cuh @@ -19,9 +19,11 @@ #pragma once +#include "detail/ternary_op.cuh" + #include #include -#include +#include namespace raft { namespace linalg { @@ -48,7 +50,7 @@ void ternaryOp(out_t* out, Lambda op, cudaStream_t stream) { - return detail::map(stream, out, len, op, in1, in2, in3); + detail::ternaryOp(out, in1, in2, in3, len, op, stream); } /** @@ -78,7 +80,33 @@ template ::max()) { + ternaryOp(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + in3.data_handle(), + out.size(), + op, + handle.get_stream()); + } else { + ternaryOp(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + in3.data_handle(), + out.size(), + op, + handle.get_stream()); + } } /** @} */ // end of group ternary_op @@ -86,4 +114,4 @@ void ternary_op( }; // end namespace linalg }; // end namespace raft -#endif +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index 23f932d2f2..ce102adfd2 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -18,9 +18,11 @@ #pragma once +#include "detail/unary_op.cuh" + #include #include -#include +#include namespace raft { namespace linalg { @@ -46,7 +48,7 @@ template void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) { - return detail::map(stream, out, len, op, in); + detail::unaryOpCaller(out, in, len, op, stream); } /** @@ -69,11 +71,7 @@ void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_ template void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) { - return detail::map(stream, out, len, [op] __device__(IdxType offset) { - OutType r; - op(&r, offset); - return r; - }); + detail::writeOnlyUnaryOpCaller(out, len, op, stream); } /** @@ -99,7 +97,20 @@ template > void unary_op(raft::device_resources const& handle, InType in, OutType out, Lambda op) { - return map(handle, in, out, op); + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Output and Input"); + + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + if (out.size() <= std::numeric_limits::max()) { + unaryOp( + out.data_handle(), in.data_handle(), out.size(), op, handle.get_stream()); + } else { + unaryOp( + out.data_handle(), in.data_handle(), out.size(), op, handle.get_stream()); + } } /** @@ -119,7 +130,17 @@ template > void write_only_unary_op(const raft::device_resources& handle, OutType out, Lambda op) { - return writeOnlyUnaryOp(out.data_handle(), out.size(), op, handle.get_stream()); + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + + using out_value_t = typename OutType::value_type; + + if (out.size() <= std::numeric_limits::max()) { + writeOnlyUnaryOp( + out.data_handle(), out.size(), op, handle.get_stream()); + } else { + writeOnlyUnaryOp( + out.data_handle(), out.size(), op, handle.get_stream()); + } } /** @} */ // end of group unary_op diff --git a/cpp/include/raft/matrix/init.cuh b/cpp/include/raft/matrix/init.cuh index ed2fb4d209..f597bbd1c6 100644 --- a/cpp/include/raft/matrix/init.cuh +++ b/cpp/include/raft/matrix/init.cuh @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -66,7 +65,9 @@ void fill(raft::device_resources const& handle, raft::device_mdspan inout, math_t scalar) { - linalg::map(handle, inout, raft::const_op{scalar}); + RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Data layout not supported"); + detail::setValue( + inout.data_handle(), inout.data_handle(), scalar, inout.size(), handle.get_stream()); } /** @} */ // end of group matrix_init diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index 4b6e6f5e31..4c6ef0e30b 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -42,11 +42,11 @@ #include #include +#include +#include #include -#include - namespace raft::neighbors::ivf_pq::detail { /** @@ -1311,13 +1311,13 @@ void ivfpq_search_worker(raft::device_resources const& handle, // of a cluster by processing the cluster at the same time as much as // possible. index_list_sorted_buf.resize(n_queries * n_probes, stream); - auto index_list_buf = - make_device_mdarray(handle, mr, make_extents(n_queries * n_probes)); + rmm::device_uvector index_list_buf(n_queries * n_probes, stream, mr); rmm::device_uvector cluster_labels_out(n_queries * n_probes, stream, mr); - auto index_list = index_list_buf.data_handle(); + auto index_list = index_list_buf.data(); index_list_sorted = index_list_sorted_buf.data(); - - linalg::map_offset(handle, index_list_buf.view(), identity_op{}); + thrust::sequence(handle.get_thrust_policy(), + thrust::device_pointer_cast(index_list), + thrust::device_pointer_cast(index_list + n_queries * n_probes)); int begin_bit = 0; int end_bit = sizeof(uint32_t) * 8; @@ -1376,15 +1376,13 @@ void ivfpq_search_worker(raft::device_resources const& handle, topK); rmm::device_uvector device_lut(search_instance.device_lut_size, stream, mr); - std::optional> query_kths_buf{std::nullopt}; - float* query_kths = nullptr; + rmm::device_uvector query_kths(0, stream, mr); if (manage_local_topk) { - query_kths_buf.emplace( - make_device_mdarray(handle, mr, make_extents(n_queries))); - linalg::map(handle, - query_kths_buf->view(), - raft::const_op{dummy_block_sort_t::queue_t::kDummy}); - query_kths = query_kths_buf->data_handle(); + query_kths.resize(n_queries, stream); + thrust::fill_n(handle.get_thrust_policy(), + query_kths.data(), + n_queries, + float(dummy_block_sort_t::queue_t::kDummy)); } search_instance(stream, index.size(), @@ -1403,7 +1401,7 @@ void ivfpq_search_worker(raft::device_resources const& handle, chunk_index.data(), query, index_list_sorted, - query_kths, + query_kths.data(), device_lut.data(), distances_buf.data(), neighbors_ptr); diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 15b40808ee..5b52374789 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -37,15 +37,13 @@ void mapLaunch(OutType* out, raft::device_resources handle{stream}; auto out_view = raft::make_device_vector_view(out, len); auto in1_view = raft::make_device_vector_view(in1, len); - auto in2_view = raft::make_device_vector_view(in2, len); - auto in3_view = raft::make_device_vector_view(in3, len); map( handle, + in1_view, out_view, [=] __device__(InType a, InType b, InType c) { return a + b + c + scalar; }, - in1_view, - in2_view, - in3_view); + in2, + in3); } template