From 6018bf9178d164a1dbcfda25b6f22ef797e01fd6 Mon Sep 17 00:00:00 2001
From: Raymond Douglass
Date: Thu, 15 Jul 2021 17:02:52 -0400
Subject: [PATCH 001/171] DOC v21.10 Updates

---
 CHANGELOG.md       | 4 ++++
 cpp/CMakeLists.txt | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c690a5059c..33d23198e5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# raft 21.10.00 (Date TBD)
+
+Please see https://github.com/rapidsai/raft/releases/tag/v21.10.00a for the latest changes to this development branch.
+
 # raft 21.08.00 (Date TBD)
 
 Please see https://github.com/rapidsai/raft/releases/tag/v21.08.00a for the latest changes to this development branch.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 1d60071d7e..eba8d75826 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -30,7 +30,7 @@ include(rapids-find)
 
 rapids_cuda_init_architectures(RAFT)
 
-project(RAFT VERSION 21.08.00 LANGUAGES CXX CUDA)
+project(RAFT VERSION 21.10.00 LANGUAGES CXX CUDA)
 
 ##############################################################################
 # - build type ---------------------------------------------------------------

From 300fcd7186ebc413354abeeb4215e4417137b733 Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Mon, 26 Jul 2021 16:46:08 -0400
Subject: [PATCH 002/171] Use the new RAPIDS.cmake to fetch rapids-cmake (#298)

The original approach of using FetchContent naively has a subtle bug when multiple projects that use rapids-cmake are combined as sibling projects. This bug causes any `include(rapids-*)` commands to fail, causing CMake errors. But using `RAPIDS.cmake` we can resolve this issue and remove the now-complex logic from each consumer.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/raft/pull/298
---
 cpp/CMakeLists.txt | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index eba8d75826..04eaf548ce 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -15,13 +15,9 @@
 #=============================================================================
 cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR)
 
-include(FetchContent)
-FetchContent_Declare(
-  rapids-cmake
-  GIT_REPOSITORY https://github.com/rapidsai/rapids-cmake.git
-  GIT_TAG origin/branch-21.08
-  )
-FetchContent_MakeAvailable(rapids-cmake)
+file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.10/RAPIDS.cmake
+  ${CMAKE_BINARY_DIR}/RAPIDS.cmake)
+include(${CMAKE_BINARY_DIR}/RAPIDS.cmake)
 include(rapids-cmake)
 include(rapids-cpm)
 include(rapids-cuda)

From 78eca24a0a1824ddcd896f08dcf820911245dcf6 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Wed, 28 Jul 2021 12:36:16 -0500
Subject: [PATCH 003/171] Remove max version pin for dask & distributed on development branch (#303)

This PR will remove max version pinning for dask & distributed for development purposes.
ref: rapidsai/cudf#8881 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - https://github.com/jakirkham URL: https://github.com/rapidsai/raft/pull/303 --- ci/gpu/build.sh | 4 ++-- ci/local/old-gpubuild.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 8f7bb5efd9..1cb3a0b6f7 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -59,8 +59,8 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid # Install the master version of dask, distributed, and dask-ml gpuci_logger "Install the master version of dask and distributed" set -x -pip install "git+https://github.com/dask/distributed.git@2021.07.1" --upgrade --no-deps -pip install "git+https://github.com/dask/dask.git@2021.07.1" --upgrade --no-deps +pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps +pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps set +x diff --git a/ci/local/old-gpubuild.sh b/ci/local/old-gpubuild.sh index 657c354603..eb2667149d 100644 --- a/ci/local/old-gpubuild.sh +++ b/ci/local/old-gpubuild.sh @@ -81,8 +81,8 @@ fi # Install the master version of dask, distributed, and dask-ml set -x -pip install "git+https://github.com/dask/distributed.git@2021.07.1" --upgrade --no-deps -pip install "git+https://github.com/dask/dask.git@2021.07.1" --upgrade --no-deps +pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps +pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps set +x From fc1e701742a4173ae5c6ac973b5097d0550be594 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 29 Jul 2021 04:14:38 +1000 Subject: [PATCH 004/171] Warnings are errors (#299) This PR fixes current RAFT C++/CUDA compilation warnings and turns on -Wall to treat warnings as errors. Fixes #225 Fixes #289 Authors: - Mark Harris (https://github.com/harrism) Approvers: - Brad Rees (https://github.com/BradReesWork) - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/299 --- cpp/cmake/modules/ConfigureCUDA.cmake | 4 +-- .../raft/distance/pairwise_distance_base.cuh | 14 ++++---- cpp/include/raft/lap/lap_functions.cuh | 14 ++++---- cpp/include/raft/lap/lap_kernels.cuh | 11 +++--- cpp/include/raft/mr/allocator.hpp | 6 ++-- cpp/include/raft/mr/buffer_base.hpp | 5 ++- cpp/include/raft/mr/device/allocator.hpp | 6 ++-- cpp/include/raft/mr/host/allocator.hpp | 7 ++-- .../sparse/hierarchy/detail/agglomerative.cuh | 21 ++++++----- cpp/include/raft/sparse/op/sort.h | 15 +++----- cpp/test/eigen_solvers.cu | 14 ++++---- cpp/test/mst.cu | 35 ++++++++++--------- cpp/test/sparse/linkage.cu | 12 ++++--- cpp/test/spatial/knn.cu | 18 ++++++---- 14 files changed, 97 insertions(+), 85 deletions(-) diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake index e92632a238..29203e86be 100644 --- a/cpp/cmake/modules/ConfigureCUDA.cmake +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -26,8 +26,8 @@ endif() list(APPEND RAFT_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) # set warnings as errors -# list(APPEND RAFT_CUDA_FLAGS -Werror=cross-execution-space-call) -# list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) +list(APPEND RAFT_CUDA_FLAGS -Werror=cross-execution-space-call) +list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) # Option to enable line info in CUDA device compilation to allow introspection when profiling / memchecking if(CUDA_ENABLE_LINEINFO) diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh index 43abc9eb65..e3ff9a7081 100644 --- a/cpp/include/raft/distance/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/pairwise_distance_base.cuh @@ -20,6 +20,8 @@ #include #include +#include + namespace raft { namespace distance { @@ -312,20 +314,20 @@ __global__ __launch_bounds__( } template -dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func) { +dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func) { const auto numSMs = raft::getMultiProcessorCount(); int numBlocksPerSm = 0; dim3 grid; CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor( &numBlocksPerSm, func, P::Nthreads, sMemSize)); - int minGridSize = numSMs * numBlocksPerSm; - int yChunks = raft::ceildiv(m, P::Mblk); - int xChunks = raft::ceildiv(n, P::Nblk); + std::size_t minGridSize = numSMs * numBlocksPerSm; + std::size_t yChunks = raft::ceildiv(m, P::Mblk); + std::size_t xChunks = raft::ceildiv(n, P::Nblk); grid.y = yChunks > minGridSize ? minGridSize : yChunks; grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; if (grid.x != 1) { - int i = 1; + std::size_t i = 1; while (grid.y * i < minGridSize) { i++; } @@ -336,4 +338,4 @@ dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func) { } }; // namespace distance -}; // namespace raft \ No newline at end of file +}; // namespace raft diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh index 0079f50e82..7640f3f816 100644 --- a/cpp/include/raft/lap/lap_functions.cuh +++ b/cpp/include/raft/lap/lap_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. 
* Copyright 2020 KETAN DATE & RAKESH NAGI * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -24,19 +24,17 @@ */ #pragma once -#include -#include -#include -#include -#include -#include #include "d_structs.h" #include #include +#include #include -#include +#include +#include + +#include namespace raft { namespace lap { diff --git a/cpp/include/raft/lap/lap_kernels.cuh b/cpp/include/raft/lap/lap_kernels.cuh index 8c9012ed72..14ad877aa4 100644 --- a/cpp/include/raft/lap/lap_kernels.cuh +++ b/cpp/include/raft/lap/lap_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * Copyright 2020 KETAN DATE & RAKESH NAGI * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -24,18 +24,15 @@ */ #pragma once -#include -#include -#include -#include -#include -#include #include "d_structs.h" #include #include #include +#include + +#include namespace raft { namespace lap { namespace detail { diff --git a/cpp/include/raft/mr/allocator.hpp b/cpp/include/raft/mr/allocator.hpp index 707b71d468..08a4987c91 100644 --- a/cpp/include/raft/mr/allocator.hpp +++ b/cpp/include/raft/mr/allocator.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,9 @@ #pragma once -#include +#include + +#include namespace raft { namespace mr { diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp index 29e0d7cfcd..3a44175182 100644 --- a/cpp/include/raft/mr/buffer_base.hpp +++ b/cpp/include/raft/mr/buffer_base.hpp @@ -16,8 +16,11 @@ #pragma once -#include #include + +#include + +#include #include #include diff --git a/cpp/include/raft/mr/device/allocator.hpp b/cpp/include/raft/mr/device/allocator.hpp index 889e1640db..3d1ce38c31 100644 --- a/cpp/include/raft/mr/device/allocator.hpp +++ b/cpp/include/raft/mr/device/allocator.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,10 +16,12 @@ #pragma once -#include #include + #include +#include + namespace raft { namespace mr { namespace device { diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp index 8af266d4f0..e5b3da24eb 100644 --- a/cpp/include/raft/mr/host/allocator.hpp +++ b/cpp/include/raft/mr/host/allocator.hpp @@ -16,12 +16,13 @@ #pragma once -#include - -#include #include #include +#include + +#include + namespace raft { namespace mr { namespace host { diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 1ac075489a..3cffa1c28a 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -20,6 +20,7 @@ #include #include #include + #include #include @@ -27,6 +28,8 @@ #include #include +#include + namespace raft { namespace hierarchy { @@ -97,8 +100,8 @@ class UnionFind { template void build_dendrogram_host(const handle_t &handle, const value_idx *rows, const value_idx *cols, const value_t *data, - size_t nnz, value_idx *children, value_t *out_delta, - value_idx *out_size) { + std::size_t nnz, value_idx *children, + value_t *out_delta, value_idx *out_size) { auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); @@ -120,7 +123,7 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, UnionFind U(nnz + 1); - for (value_idx i = 0; i < nnz; i++) { + for (std::size_t i = 0; i < nnz; i++) { value_idx a = mst_src_h[i]; value_idx b = mst_dst_h[i]; value_t delta = mst_weights_h[i]; @@ -167,7 +170,7 @@ __global__ void write_levels_kernel(const value_idx *children, */ template __global__ void inherit_labels(const value_idx *children, - const value_idx *levels, size_t n_leaves, + const value_idx *levels, std::size_t n_leaves, value_idx *labels, int cut_level, value_idx n_vertices) { value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; @@ -222,8 +225,8 @@ struct init_label_roots { */ template void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, - const value_idx *children, size_t n_clusters, - size_t n_leaves) { + const value_idx *children, + std::size_t n_clusters, std::size_t n_leaves) { auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); auto thrust_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); @@ -241,7 +244,7 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, * out for each of the children */ - size_t n_edges = (n_leaves - 1) * 2; + auto n_edges = (n_leaves - 1) * 2; thrust::device_ptr d_ptr = thrust::device_pointer_cast(children); @@ -250,7 +253,9 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, // Prevent potential infinite loop from labeling disconnected // connectivities graph. - RAFT_EXPECTS(n_vertices == (n_leaves - 1) * 2, + RAFT_EXPECTS(n_leaves > 0, "n_leaves must be positive"); + RAFT_EXPECTS(static_cast(n_vertices) == + static_cast((n_leaves - 1) * 2), "Multiple components found in MST or MST is invalid. 
" "Cannot find single-linkage solution."); diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h index 9dbe2b67c5..09d5b568be 100644 --- a/cpp/include/raft/sparse/op/sort.h +++ b/cpp/include/raft/sparse/op/sort.h @@ -16,25 +16,22 @@ #pragma once -#include - #include #include +#include #include #include #include +#include #include #include +#include + #include -#include #include -#include - -#include -#include namespace raft { namespace sparse { @@ -106,8 +103,6 @@ void coo_sort(COO *const in, template void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data, value_idx nnz, cudaStream_t stream) { - thrust::device_ptr t_rows = thrust::device_pointer_cast(rows); - thrust::device_ptr t_cols = thrust::device_pointer_cast(cols); thrust::device_ptr t_data = thrust::device_pointer_cast(data); auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); @@ -117,4 +112,4 @@ void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data, } }; // namespace op }; // end NAMESPACE sparse -}; // end NAMESPACE raft \ No newline at end of file +}; // end NAMESPACE raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index e6ee09262e..328137f42d 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,12 +14,14 @@ * limitations under the License. */ +#include +#include + #include + +#include #include #include -#include - -#include namespace raft { @@ -37,8 +39,6 @@ TEST(Raft, EigenSolvers) { value_type* vs{nullptr}; index_type nnz = 0; index_type nrows = 0; - auto stream = h.get_stream(); - auto t_exe_pol = thrust::cuda::par.on(stream); sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; ASSERT_EQ(nullptr, sm1.row_offsets_); @@ -53,7 +53,7 @@ TEST(Raft, EigenSolvers) { // value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; - unsigned long long seed{100110021003}; + std::uint64_t seed{100110021003}; eigen_solver_config_t cfg{ neigvs, maxiter, restart_iter, tol, reorthog, seed}; diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index d7aa76500b..94f81cddb8 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,19 +14,20 @@ * limitations under the License. 
*/ -#include - -#include -#include -#include -#include +#include "test_utils.h" #include #include - #include -#include "test_utils.h" +#include + +#include + +#include + +#include +#include template struct CSRHost { @@ -55,25 +56,25 @@ namespace mst { // Returns total weight of MST template weight_t prims(CSRHost &csr_h) { - auto n_vertices = csr_h.offsets.size() - 1; + std::size_t n_vertices = csr_h.offsets.size() - 1; bool active_vertex[n_vertices]; // bool mst_set[csr_h.n_edges]; weight_t curr_edge[n_vertices]; - for (auto i = 0; i < n_vertices; i++) { + for (std::size_t i = 0; i < n_vertices; i++) { active_vertex[i] = false; - curr_edge[i] = INT_MAX; + curr_edge[i] = static_cast(std::numeric_limits::max()); } curr_edge[0] = 0; // function to pick next min vertex-edge auto min_vertex_edge = [](auto *curr_edge, auto *active_vertex, auto n_vertices) { - weight_t min = INT_MAX; - vertex_t min_vertex; + auto min = static_cast(std::numeric_limits::max()); + vertex_t min_vertex{}; - for (auto v = 0; v < n_vertices; v++) { + for (std::size_t v = 0; v < n_vertices; v++) { if (!active_vertex[v] && curr_edge[v] < min) { min = curr_edge[v]; min_vertex = v; @@ -84,7 +85,7 @@ weight_t prims(CSRHost &csr_h) { }; // iterate over n vertices - for (auto v = 0; v < n_vertices - 1; v++) { + for (std::size_t v = 0; v < n_vertices - 1; v++) { // pick min vertex-edge auto curr_v = min_vertex_edge(curr_edge, active_vertex, n_vertices); @@ -106,7 +107,7 @@ weight_t prims(CSRHost &csr_h) { // find sum of MST weight_t total_weight = 0; - for (auto v = 1; v < n_vertices; v++) { + for (std::size_t v = 1; v < n_vertices; v++) { total_weight += curr_edge[v]; } diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index ce567e4298..a157a17e30 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -14,19 +14,21 @@ * limitations under the License. */ -#include -#include -#include -#include +#include "../test_utils.h" +#include #include #include +#include #include #include #include + #include -#include "../test_utils.h" +#include + +#include namespace raft { diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 2b1ef89f7a..de6251d32d 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -14,13 +14,18 @@ * limitations under the License. 
*/ -#include +#include "../test_utils.h" + #include -#include #include + #include + +#include + +#include +#include #include -#include "../test_utils.h" namespace raft { namespace spatial { @@ -37,8 +42,7 @@ __global__ void build_actual_output(int *output, int n_rows, int k, int element = threadIdx.x + blockDim.x * blockIdx.x; if (element >= n_rows * k) return; - int ind = (int)indices[element]; - output[element] = idx_labels[ind]; + output[element] = idx_labels[indices[element]]; } __global__ void build_expected_output(int *output, int n_rows, int k, @@ -101,8 +105,8 @@ class KNNTest : public ::testing::TestWithParam { k_ = params_.k; std::vector row_major_input; - for (int i = 0; i < params_.input.size(); ++i) { - for (int j = 0; j < params_.input[i].size(); ++j) { + for (std::size_t i = 0; i < params_.input.size(); ++i) { + for (std::size_t j = 0; j < params_.input[i].size(); ++j) { row_major_input.push_back(params_.input[i][j]); } } From 947e22f65c2fd44f33f0f27c738bd74a1a1a69b4 Mon Sep 17 00:00:00 2001 From: Dillon Cullinan Date: Thu, 29 Jul 2021 12:01:38 -0400 Subject: [PATCH 005/171] ENH Replace gpuci_conda_retry with gpuci_mamba_retry (#295) `mamba` was recently added to gpuCI build environment, testing usage and solvability with this PR which should speed up build times. Authors: - Dillon Cullinan (https://github.com/dillon-cullinan) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/raft/pull/295 --- ci/gpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 1cb3a0b6f7..a62354883b 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -39,13 +39,13 @@ env gpuci_logger "Check GPU usage" nvidia-smi -# temporary usage of gpuci_conda_retry install with packages listed here, looking into +# temporary usage of gpuci_mamba_retry install with packages listed here, looking into # using the repos yaml files for this gpuci_logger "Activate conda env" . /opt/conda/etc/profile.d/conda.sh conda activate rapids gpuci_logger "Installing packages needed for RAFT" -gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \ +gpuci_mamba_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \ "cudatoolkit=${CUDA_REL}" \ "cudf=${MINOR_VERSION}" \ "rmm=${MINOR_VERSION}" \ From e4d8a036a5e268ebe360eea1f0f249168e94ebfa Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 4 Aug 2021 11:48:43 +1000 Subject: [PATCH 006/171] Enable CUDA device code warnings as errors (#307) Adds `-Werror=all-warnings` NVCC flag to ensure all CUDA device code warnings are treated as errors. Only enabled on CUDA 11.2+ because CUDA 11.0 has PTXAS warnings that go away in newer CUDA versions. Missed this in #299. 
Authors: - Mark Harris (https://github.com/harrism) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/307 --- cpp/cmake/modules/ConfigureCUDA.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake index 29203e86be..3786910be0 100644 --- a/cpp/cmake/modules/ConfigureCUDA.cmake +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -26,7 +26,9 @@ endif() list(APPEND RAFT_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) # set warnings as errors -list(APPEND RAFT_CUDA_FLAGS -Werror=cross-execution-space-call) +if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0) + list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings) +endif() list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) # Option to enable line info in CUDA device compilation to allow introspection when profiling / memchecking From 78b67af6cb7a919a5400c9679169c366221cf283 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Aug 2021 08:01:50 +1000 Subject: [PATCH 007/171] Fix more warnings (#311) Warnings missed in #299... Authors: - Mark Harris (https://github.com/harrism) Approvers: - Divye Gala (https://github.com/divyegala) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/311 --- cpp/include/raft/distance/distance.cuh | 34 +++++++++++------------ cpp/include/raft/linalg/cublas_wrappers.h | 5 +--- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 1b39a6ec18..fc0d07773f 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -47,7 +47,7 @@ struct DistanceImpl { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + cudaStream_t stream, bool isRowMajor, InType) { raft::distance::euclideanAlgo1(m, n, k, x, y, dist, false, (AccType *)workspace, worksize, @@ -61,7 +61,7 @@ struct DistanceImpl { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + cudaStream_t stream, bool isRowMajor, InType) { raft::distance::euclideanAlgo1(m, n, k, x, y, dist, true, (AccType *)workspace, worksize, @@ -75,7 +75,7 @@ struct DistanceImpl { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + cudaStream_t stream, bool isRowMajor, InType) { raft::distance::cosineAlgo1( m, n, k, x, y, dist, (AccType *)workspace, worksize, fin_op, stream, isRowMajor); @@ -87,8 +87,8 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { raft::distance::euclideanAlgo2(m, n, k, x, y, dist, false, fin_op, stream, isRowMajor); @@ -100,8 +100,8 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, 
- cudaStream_t stream, bool isRowMajor, InType metric_arg) { + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { raft::distance::euclideanAlgo2(m, n, k, x, y, dist, true, fin_op, stream, isRowMajor); @@ -113,8 +113,8 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { raft::distance::l1Impl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } @@ -125,8 +125,8 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { raft::distance::chebyshevImpl(m, n, k, x, y, dist, fin_op, stream, isRowMajor); @@ -138,8 +138,8 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { raft::distance::hellingerImpl(m, n, k, x, y, dist, fin_op, stream, isRowMajor); @@ -151,8 +151,8 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType metric_arg) { raft::distance::minkowskiImpl(m, n, k, x, y, dist, fin_op, stream, isRowMajor, metric_arg); @@ -164,8 +164,8 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { raft::distance::canberraImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index 7c79e6c91d..1be14a550d 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -86,10 +86,8 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { /** FIXME: temporary alias for cuML compatibility */ #define CUBLAS_CHECK(call) CUBLAS_TRY(call) -///@todo: enable this once we have logging enabled -#if 0 /** check for cublas runtime API errors but do not assert */ -define CUBLAS_CHECK_NO_THROW(call) \ +#define CUBLAS_CHECK_NO_THROW(call) \ do { \ cublasStatus_t err = call; \ if (err != CUBLAS_STATUS_SUCCESS) { \ @@ -97,7 +95,6 @@ define CUBLAS_CHECK_NO_THROW(call) \ raft::linalg::detail::cublas_error_to_string(err)); \ } \ } while (0) -#endif namespace raft { namespace linalg { From 3196480e2b41060b886e597e8ce07ce7d5eb7daa Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 16 Aug 2021 15:41:03 -0500 Subject: [PATCH 008/171] Temporarily pin RMM while refactor removes deprecated calls (#315) This will unblock cuGraph's build. 
Authors: - Dante Gama Dessavre (https://github.com/dantegd) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/raft/pull/315 --- cpp/cmake/thirdparty/get_rmm.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 51f959a8d9..85ebc6238e 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -32,8 +32,8 @@ function(find_and_configure_rmm VERSION) INSTALL_EXPORT_SET raft-exports CPM_ARGS GIT_REPOSITORY https://github.com/rapidsai/rmm.git - GIT_TAG branch-${MAJOR_AND_MINOR} - GIT_SHALLOW TRUE + GIT_TAG 23bbe745af1d988224b5498f7b8e3fe3720532d4 + GIT_SHALLOW FALSE OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "CUDA_STATIC_RUNTIME ${CUDA_STATIC_RUNTIME}" From 8992816ea79f1404bd7aafc9b729f32df4c5e4f6 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 25 Aug 2021 18:19:27 +0200 Subject: [PATCH 009/171] Update to UCX-Py 0.22 (#319) Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) URL: https://github.com/rapidsai/raft/pull/319 --- ci/gpu/build.sh | 2 +- ci/local/old-gpubuild.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index a62354883b..0ba9901107 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -51,7 +51,7 @@ gpuci_mamba_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid "rmm=${MINOR_VERSION}" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ - "ucx-py=0.21.*" \ + "ucx-py=0.22.*" \ "rapids-build-env=${MINOR_VERSION}.*" \ "rapids-notebook-env=${MINOR_VERSION}.*" \ "rapids-doc-env=${MINOR_VERSION}.*" diff --git a/ci/local/old-gpubuild.sh b/ci/local/old-gpubuild.sh index eb2667149d..efd6c0382a 100644 --- a/ci/local/old-gpubuild.sh +++ b/ci/local/old-gpubuild.sh @@ -56,7 +56,7 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid "distributed>=2.12.0" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ - "ucx-py=0.21.*" + "ucx-py=0.22.*" if [ "$RUN_CUML_LIBCUML_TESTS" = "ON" ] || [ "$RUN_CUML_PRIMS_TESTS" = "ON" ] || [ "$RUN_CUML_PYTHON_TESTS" = "ON" ]; then gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \ From aab9b958399fee343e6ac9d476fd18fba4df04f8 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade <36705640+mdoijade@users.noreply.github.com> Date: Thu, 26 Aug 2021 04:25:22 +0530 Subject: [PATCH 010/171] Add Hamming, Jensen-Shannon, KL-Divergence, Russell rao and Correlation distance metrics support (#306) This PR introduces the following distances: - Hamming - Jensen-Shannon - Russell-Rao - KL-Divergence - Correlation with unit tests for each of them. Authors: - Mahesh Doijade (https://github.com/mdoijade) Approvers: - Corey J. 
Nolet (https://github.com/cjnolet) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/raft/pull/306 --- cpp/include/raft/distance/correlation.cuh | 247 +++++++++++++++++++ cpp/include/raft/distance/distance.cuh | 107 +++++++- cpp/include/raft/distance/hamming.cuh | 175 +++++++++++++ cpp/include/raft/distance/jensen_shannon.cuh | 181 ++++++++++++++ cpp/include/raft/distance/kl_divergence.cuh | 242 ++++++++++++++++++ cpp/include/raft/distance/russell_rao.cuh | 171 +++++++++++++ cpp/test/CMakeLists.txt | 5 + cpp/test/distance/dist_correlation.cu | 69 ++++++ cpp/test/distance/dist_hamming.cu | 69 ++++++ cpp/test/distance/dist_jensen_shannon.cu | 69 ++++++ cpp/test/distance/dist_kl_divergence.cu | 69 ++++++ cpp/test/distance/dist_russell_rao.cu | 69 ++++++ cpp/test/distance/distance_base.cuh | 163 +++++++++++- 13 files changed, 1632 insertions(+), 4 deletions(-) create mode 100644 cpp/include/raft/distance/correlation.cuh create mode 100644 cpp/include/raft/distance/hamming.cuh create mode 100644 cpp/include/raft/distance/jensen_shannon.cuh create mode 100644 cpp/include/raft/distance/kl_divergence.cuh create mode 100644 cpp/include/raft/distance/russell_rao.cuh create mode 100644 cpp/test/distance/dist_correlation.cu create mode 100644 cpp/test/distance/dist_hamming.cu create mode 100644 cpp/test/distance/dist_jensen_shannon.cu create mode 100644 cpp/test/distance/dist_kl_divergence.cu create mode 100644 cpp/test/distance/dist_russell_rao.cu diff --git a/cpp/include/raft/distance/correlation.cuh b/cpp/include/raft/distance/correlation.cuh new file mode 100644 index 0000000000..ed3b7a5464 --- /dev/null +++ b/cpp/include/raft/distance/correlation.cuh @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include + +namespace raft { +namespace distance { + +/** + * @brief the Correlation distance matrix: + * + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Veclen number of k-elements loaded by each thread + for every LDG call. 
details in contractions.cuh + * @tparam FinalLambda final lambda called on final distance value + * @tparam isRowMajor true if input/output is row major, + false for column major + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and C/D + * @param[in] k number of cols of A and B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[output] dOutput output matrix + * @param[in] fin_op the final gemm epilogue lambda + * @param[in] stream cuda stream to launch work + */ +template +static void correlationImpl(const DataT *x, const DataT *y, const DataT *xn, + const DataT *yn, const DataT *x2n, const DataT *y2n, + IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, + IdxT ldd, OutT *dOutput, FinalLambda fin_op, + cudaStream_t stream) { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef + typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + acc += x * y; + }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [x2n, y2n, m, n, k] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { + DataT regx2n[KPolicy::AccRowsPerTh], regy2n[KPolicy::AccColsPerTh]; + + extern __shared__ char smem[]; + DataT *sx2Norm = + (DataT *)(&smem[KPolicy::SmemSize + + (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)]); + DataT *sy2Norm = (&sx2Norm[KPolicy::Mblk]); + + // Load x & y norms required by this threadblock in shmem buffer + if (gridStrideX == blockIdx.x * KPolicy::Nblk) { + for (int i = threadIdx.x; i < KPolicy::Mblk; i += KPolicy::Nthreads) { + auto idx = gridStrideY + i; + sx2Norm[i] = idx < m ? x2n[idx] : 0; + } + } + + for (int i = threadIdx.x; i < KPolicy::Nblk; i += KPolicy::Nthreads) { + auto idx = gridStrideX + i; + sy2Norm[i] = idx < n ? 
y2n[idx] : 0; + } + __syncthreads(); + +#pragma unroll + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { + regx2n[i] = + sx2Norm[i * KPolicy::AccThRows + (threadIdx.x / KPolicy::AccThCols)]; + } +#pragma unroll + for (int i = 0; i < KPolicy::AccColsPerTh; ++i) { + regy2n[i] = + sy2Norm[i * KPolicy::AccThCols + (threadIdx.x % KPolicy::AccThCols)]; + } + +#pragma unroll + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + auto numer = k * acc[i][j] - (regxn[i] * regyn[j]); + auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]); + auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]); + + acc[i][j] = 1 - (numer / raft::mySqrt(Q_denom * R_denom)); + } + } + }; + + constexpr size_t shmemSize = + KPolicy::SmemSize + (2 * (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); + if (isRowMajor) { + constexpr auto correlationRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + correlationRowMajor); + correlationRowMajor<<>>( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, + fin_op); + } else { + constexpr auto correlationColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + correlationColMajor); + correlationColMajor<<>>( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, + fin_op); + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void correlation(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, const DataT *xn, + const DataT *yn, const DataT *x2n, const DataT *y2n, + OutT *dOutput, FinalLambda fin_op, cudaStream_t stream) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + correlationImpl(x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, + dOutput, fin_op, stream); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + correlationImpl(x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, + dOutput, fin_op, stream); + } else { + correlationImpl( + x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + } +} + +/** + * @brief the Correlation distance matrix calculation + * + * @tparam InType input data-type (for A and B matrices) + * @tparam AccType accumulation data-type + * @tparam OutType output data-type (for C and D matrices) + * @tparam FinalLambda user-defined epilogue lamba + * @tparam Index_ Index type + * @param m number of rows of A and C/D + * @param n number of columns of B and C/D + * @param k number of cols of A and rows of B + * @param pA input matrix + * @param pB input matrix + * @param pD output matrix + * @param fin_op the final element-wise epilogue lambda + * @param stream cuda stream where to launch work + * @param isRowMajor whether the input and output matrices are row major + */ +template +void correlationImpl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, AccType *workspace, size_t &worksize, + FinalLambda fin_op, cudaStream_t stream, bool isRowMajor) { + typedef std::is_same is_bool; + typedef typename std::conditional::type + correlationOutType; + Index_ lda, ldb, ldd; + correlationOutType *pDcast = reinterpret_cast(pD); + + ASSERT(!(((pA != pB) && (worksize < 2 * (m + n) * sizeof(AccType))) || + (worksize < 2 * m * sizeof(AccType))), + "workspace size error"); + ASSERT(workspace != nullptr, "workspace is null"); + + AccType *norm_col_vec = 
workspace; + AccType *norm_row_vec = workspace; + AccType *sq_norm_col_vec = workspace; + AccType *sq_norm_row_vec = workspace; + if (pA != pB) { + norm_row_vec += m; + + raft::linalg::reduce(norm_col_vec, pA, k, m, (AccType)0, isRowMajor, true, + stream, false, raft::Nop(), + raft::Sum()); + raft::linalg::reduce(norm_row_vec, pB, k, n, (AccType)0, isRowMajor, true, + stream, false, raft::Nop(), + raft::Sum()); + + sq_norm_col_vec += (m + n); + sq_norm_row_vec = sq_norm_col_vec + m; + raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, + isRowMajor, stream); + raft::linalg::rowNorm(sq_norm_row_vec, pB, k, n, raft::linalg::L2Norm, + isRowMajor, stream); + } else { + raft::linalg::reduce(norm_col_vec, pA, k, m, (AccType)0, isRowMajor, true, + stream, false, raft::Nop(), + raft::Sum()); + sq_norm_col_vec += m; + sq_norm_row_vec = sq_norm_col_vec; + raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, + isRowMajor, stream); + } + + if (isRowMajor) { + lda = k, ldb = k, ldd = n; + correlation( + m, n, k, lda, ldb, ldd, pA, pB, norm_col_vec, norm_row_vec, + sq_norm_col_vec, sq_norm_row_vec, pDcast, fin_op, stream); + } else { + lda = n, ldb = m, ldd = m; + correlation(n, m, k, lda, ldb, ldd, pB, pA, norm_row_vec, + norm_col_vec, sq_norm_row_vec, sq_norm_col_vec, pDcast, + fin_op, stream); + } +} +} // namespace distance +} // namespace raft diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index fc0d07773f..02d8fb6d03 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -21,11 +21,16 @@ #include #include #include +#include #include #include +#include #include +#include +#include #include #include +#include #include namespace raft { @@ -171,6 +176,72 @@ struct DistanceImpl +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { + raft::distance::hammingUnexpandedImpl(m, n, k, x, y, dist, fin_op, + stream, isRowMajor); + } +}; + +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { + raft::distance::jensenShannonImpl(m, n, k, x, y, dist, fin_op, + stream, isRowMajor); + } +}; + +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { + raft::distance::russellRaoImpl(m, n, k, x, y, dist, fin_op, stream, + isRowMajor); + } +}; + +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { + raft::distance::klDivergenceImpl(m, n, k, x, y, dist, fin_op, + stream, isRowMajor); + } +}; + +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType) { + raft::distance::correlationImpl(m, n, k, x, y, dist, + (AccType *)workspace, worksize, + fin_op, stream, isRowMajor); + } +}; + } // anonymous namespace /** @@ -195,11 +266,16 @@ size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n, Index_ k) { size_t 
worksize = 0; constexpr bool is_allocated = - distanceType <= raft::distance::DistanceType::CosineExpanded; + (distanceType <= raft::distance::DistanceType::CosineExpanded) || + (distanceType == raft::distance::DistanceType::CorrelationExpanded); + constexpr int numOfBuffers = + (distanceType == raft::distance::DistanceType::CorrelationExpanded) ? 2 : 1; + if (is_allocated) { - worksize += m * sizeof(AccType); - if (x != y) worksize += n * sizeof(AccType); + worksize += numOfBuffers * m * sizeof(AccType); + if (x != y) worksize += numOfBuffers * n * sizeof(AccType); } + return worksize; } @@ -366,6 +442,31 @@ void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m, raft::distance::DistanceType::Canberra>( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; + case raft::distance::DistanceType::HammingUnexpanded: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, stream, isRowMajor); + break; + case raft::distance::DistanceType::JensenShannon: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, stream, isRowMajor); + break; + case raft::distance::DistanceType::RusselRaoExpanded: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, stream, isRowMajor); + break; + case raft::distance::DistanceType::KLDivergence: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, stream, isRowMajor); + break; + case raft::distance::DistanceType::CorrelationExpanded: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, stream, isRowMajor); + break; default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; diff --git a/cpp/include/raft/distance/hamming.cuh b/cpp/include/raft/distance/hamming.cuh new file mode 100644 index 0000000000..08f1020b85 --- /dev/null +++ b/cpp/include/raft/distance/hamming.cuh @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft { +namespace distance { + +/** + * @brief the Hamming distance matrix using the unexpanded form: + * It computes the following equation: + Cij = sum(x_i != y_i) / k + * + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Veclen number of k-elements loaded by each thread + for every LDG call. 
details in contractions.cuh + * @tparam FinalLambda final lambda called on final distance value + * @tparam isRowMajor true if input/output is row major, + false for column major + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and C/D + * @param[in] k number of cols of A and B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[output] dOutput output matrix + * @param[in] fin_op the final gemm epilogue lambda + * @param[in] stream cuda stream to launch work + */ +template +static void hammingUnexpandedImpl(const DataT *x, const DataT *y, IdxT m, + IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + OutT *dOutput, FinalLambda fin_op, + cudaStream_t stream) { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef + typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + acc += (x != y); + }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [k] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { + const DataT one_over_k = DataT(1.0) / k; +#pragma unroll + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + acc[i][j] *= one_over_k; + } + } + }; + + if (isRowMajor) { + auto hammingUnexpandedRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + hammingUnexpandedRowMajor); + + hammingUnexpandedRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } else { + auto hammingUnexpandedColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + hammingUnexpandedColMajor); + hammingUnexpandedColMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void hammingUnexpanded(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + hammingUnexpandedImpl(x, y, m, n, k, lda, ldb, ldd, + dOutput, fin_op, stream); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + hammingUnexpandedImpl(x, y, m, n, k, lda, ldb, ldd, + dOutput, fin_op, stream); + } else { + hammingUnexpandedImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + } +} + +/** + * @brief the Hamming Unexpanded distance matrix calculation + * It computes the following equation: + Cij = sum(x_i != y_i) / k + * + * @tparam InType input data-type (for A and B matrices) + * @tparam AccType accumulation data-type + * @tparam OutType output data-type (for C and D matrices) + * @tparam FinalLambda user-defined epilogue lamba + * @tparam Index_ Index type + * @param m number of rows of A and C/D + * @param n number of columns of B and C/D + * @param k number of cols of A and rows of B + * @param pA input matrix + * @param pB input matrix + * @param pD 
output matrix + * @param fin_op the final element-wise epilogue lambda + * @param stream cuda stream where to launch work + * @param isRowMajor whether the input and output matrices are row major + */ +template +void hammingUnexpandedImpl(int m, int n, int k, const InType *pA, + const InType *pB, OutType *pD, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor) { + typedef std::is_same is_bool; + typedef typename std::conditional::type + hammingUnexpandedOutType; + Index_ lda, ldb, ldd; + hammingUnexpandedOutType *pDcast = + reinterpret_cast(pD); + if (isRowMajor) { + lda = k, ldb = k, ldd = n; + hammingUnexpanded(m, n, k, lda, ldb, ldd, pA, pB, pDcast, + fin_op, stream); + + } else { + lda = n, ldb = m, ldd = m; + hammingUnexpanded(n, m, k, lda, ldb, ldd, pB, pA, + pDcast, fin_op, stream); + } +} +} // namespace distance +} // namespace raft diff --git a/cpp/include/raft/distance/jensen_shannon.cuh b/cpp/include/raft/distance/jensen_shannon.cuh new file mode 100644 index 0000000000..2a94205853 --- /dev/null +++ b/cpp/include/raft/distance/jensen_shannon.cuh @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft { +namespace distance { + +/** + * @brief the Jensen Shannon distance matrix: + * It computes the following equation: + Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) + + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) + * + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Veclen number of k-elements loaded by each thread + for every LDG call. 
details in contractions.cuh + * @tparam FinalLambda final lambda called on final distance value + * @tparam isRowMajor true if input/output is row major, + false for column major + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and C/D + * @param[in] k number of cols of A and B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[output] dOutput output matrix + * @param[in] fin_op the final gemm epilogue lambda + * @param[in] stream cuda stream to launch work + */ +template +static void jensenShannonImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, + IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + OutT *dOutput, FinalLambda fin_op, + cudaStream_t stream) { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef + typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + const DataT m = 0.5f * (x + y); + const bool m_zero = (m == 0); + const auto logM = (!m_zero) * raft::myLog(m + m_zero); + + const bool x_zero = (x == 0); + const bool y_zero = (y == 0); + acc += (-x * (logM - raft::myLog(x + x_zero))) + + (-y * (logM - raft::myLog(y + y_zero))); + }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { +#pragma unroll + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + acc[i][j] = raft::mySqrt(0.5 * acc[i][j]); + } + } + }; + + if (isRowMajor) { + auto jensenShannonRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + jensenShannonRowMajor); + + jensenShannonRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } else { + auto jensenShannonColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + jensenShannonColMajor); + jensenShannonColMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void jensenShannon(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + jensenShannonImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + jensenShannonImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else { + jensenShannonImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + } +} + +/** + * @brief the Jensen Shannon distance matrix calculation + * It computes the following equation: + Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) + + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) + * + * @tparam InType input data-type (for A and B matrices) + * @tparam AccType accumulation data-type + * @tparam OutType output data-type (for C and D matrices) + * 
@tparam FinalLambda user-defined epilogue lamba + * @tparam Index_ Index type + * @param m number of rows of A and C/D + * @param n number of columns of B and C/D + * @param k number of cols of A and rows of B + * @param pA input matrix + * @param pB input matrix + * @param pD output matrix + * @param fin_op the final element-wise epilogue lambda + * @param stream cuda stream where to launch work + * @param isRowMajor whether the input and output matrices are row major + */ +template +void jensenShannonImpl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor) { + typedef std::is_same is_bool; + typedef typename std::conditional::type + jensenShannonOutType; + Index_ lda, ldb, ldd; + jensenShannonOutType *pDcast = reinterpret_cast(pD); + if (isRowMajor) { + lda = k, ldb = k, ldd = n; + jensenShannon(m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + + } else { + lda = n, ldb = m, ldd = m; + jensenShannon(n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, + stream); + } +} +} // namespace distance +} // namespace raft diff --git a/cpp/include/raft/distance/kl_divergence.cuh b/cpp/include/raft/distance/kl_divergence.cuh new file mode 100644 index 0000000000..3197b73d10 --- /dev/null +++ b/cpp/include/raft/distance/kl_divergence.cuh @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft { +namespace distance { + +/** + * @brief the KL Divergence distance matrix: + * It computes the following equation: + Cij = 0.5 * sum(x * log (x / y)); + * This distance computation modifies A or B by computing a log(x) + * and then performing a `pow(e, log(x))` to convert it back. Because of this, + * it is possible that the values in A or B might differ slightly + * after this is invoked. + * + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Veclen number of k-elements loaded by each thread + for every LDG call. 
details in contractions.cuh + * @tparam FinalLambda final lambda called on final distance value + * @tparam isRowMajor true if input/output is row major, + false for column major + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and C/D + * @param[in] k number of cols of A and B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[output] dOutput output matrix + * @param[in] fin_op the final gemm epilogue lambda + * @param[in] stream cuda stream to launch work + */ +template +static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, + IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + OutT *dOutput, FinalLambda fin_op, + cudaStream_t stream) { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef + typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + if (isRowMajor) { + const bool x_zero = (x == 0); + acc += x * (raft::myLog(x + x_zero) - y); + } else { + const bool y_zero = (y == 0); + acc += y * (raft::myLog(y + y_zero) - x); + } + }; + + auto core_lambda_x_equal_y = [] __device__(AccT & acc, DataT & x, DataT & y) { + if (isRowMajor) { + const bool x_zero = (x == 0); + const bool y_zero = (y == 0); + acc += + x * (raft::myLog(x + x_zero) - (!y_zero) * raft::myLog(y + y_zero)); + } else { + const bool y_zero = (y == 0); + const bool x_zero = (x == 0); + acc += + y * (raft::myLog(y + y_zero) - (!x_zero) * raft::myLog(x + x_zero)); + } + }; + + auto unaryOp_lambda = [] __device__(DataT input) { + const bool x_zero = (input == 0); + return (!x_zero) * raft::myLog(input + x_zero); + }; + + auto unaryOp_lambda_reverse = [] __device__(DataT input) { + // reverse previous log (x) back to x using (e ^ log(x)) + const bool x_zero = (input == 0); + return (!x_zero) * raft::myExp(input); + }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { +#pragma unroll + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + acc[i][j] = (0.5f * acc[i][j]); + } + } + }; + + if (isRowMajor) { + constexpr auto klDivergenceRowMajor = + pairwiseDistanceMatKernel; + constexpr auto klDivergenceRowMajorXequalY = + pairwiseDistanceMatKernel; + if (x != y) { + raft::linalg::unaryOp( + (DataT *)y, y, n * k, unaryOp_lambda, stream); + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + klDivergenceRowMajor); + klDivergenceRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + // Now reverse previous log (x) back to x using (e ^ log(x)) + raft::linalg::unaryOp( + (DataT *)y, y, n * k, unaryOp_lambda_reverse, stream); + } else { + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + klDivergenceRowMajorXequalY); + klDivergenceRowMajorXequalY<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, + core_lambda_x_equal_y, epilog_lambda, fin_op); + } + } else { + constexpr auto klDivergenceColMajor = + pairwiseDistanceMatKernel; + constexpr auto klDivergenceColMajorXequalY = + pairwiseDistanceMatKernel; + if (x 
!= y) { + raft::linalg::unaryOp( + (DataT *)x, x, m * k, unaryOp_lambda, stream); + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + klDivergenceColMajor); + klDivergenceColMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + // Now reverse previous log (x) back to x using (e ^ log(x)) + raft::linalg::unaryOp( + (DataT *)x, x, m * k, unaryOp_lambda_reverse, stream); + } else { + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + klDivergenceColMajorXequalY); + klDivergenceColMajorXequalY<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, + core_lambda_x_equal_y, epilog_lambda, fin_op); + } + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void klDivergence(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + klDivergenceImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + klDivergenceImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else { + klDivergenceImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + } +} + +/** + * @brief the KL Divergence distance matrix calculation + * It computes the following equation: + Cij = 0.5 * sum(x * log (x / y)); + * This distance computation modifies A or B by computing a log(x) + * and then performing a `pow(e, log(x))` to convert it back. Because of this, + * it is possible that the values in A or B might differ slightly + * after this is invoked. + * @tparam InType input data-type (for A and B matrices) + * @tparam AccType accumulation data-type + * @tparam OutType output data-type (for C and D matrices) + * @tparam FinalLambda user-defined epilogue lamba + * @tparam Index_ Index type + * @param m number of rows of A and C/D + * @param n number of columns of B and C/D + * @param k number of cols of A and rows of B + * @param pA input matrix + * @param pB input matrix + * @param pD output matrix + * @param fin_op the final element-wise epilogue lambda + * @param stream cuda stream where to launch work + * @param isRowMajor whether the input and output matrices are row major + */ +template +void klDivergenceImpl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor) { + typedef std::is_same is_bool; + typedef typename std::conditional::type + klDivergenceOutType; + Index_ lda, ldb, ldd; + klDivergenceOutType *pDcast = reinterpret_cast(pD); + if (isRowMajor) { + lda = k, ldb = k, ldd = n; + klDivergence(m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + + } else { + lda = n, ldb = m, ldd = m; + klDivergence(n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); + } +} +} // namespace distance +} // namespace raft diff --git a/cpp/include/raft/distance/russell_rao.cuh b/cpp/include/raft/distance/russell_rao.cuh new file mode 100644 index 0000000000..417fb73b94 --- /dev/null +++ b/cpp/include/raft/distance/russell_rao.cuh @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft { +namespace distance { + +/** + * @brief the Russell Rao distance matrix: + * It computes the following equation: + Cij = (k - sum(x_i * y_i)) / k + * + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Veclen number of k-elements loaded by each thread + for every LDG call. details in contractions.cuh + * @tparam FinalLambda final lambda called on final distance value + * @tparam isRowMajor true if input/output is row major, + false for column major + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and C/D + * @param[in] k number of cols of A and B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[output] dOutput output matrix + * @param[in] fin_op the final gemm epilogue lambda + * @param[in] stream cuda stream to launch work + */ +template +static void russellRaoImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, + IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef + typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + acc += x * y; + }; + + const float one_over_k = 1.0 / k; + // epilogue operation lambda for final value calculation + auto epilog_lambda = [k, one_over_k] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { +#pragma unroll + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + acc[i][j] = (k - acc[i][j]) * one_over_k; + } + } + }; + + if (isRowMajor) { + constexpr auto russellRaoRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + russellRaoRowMajor); + + russellRaoRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } else { + constexpr auto russellRaoColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + russellRaoColMajor); + russellRaoColMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void russellRao(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + russellRaoImpl(x, y, m, n, k, lda, ldb, ldd, 
dOutput, fin_op, + stream); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + russellRaoImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else { + russellRaoImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + } +} + +/** + * @brief the Russell Rao distance matrix calculation + * It computes the following equation: + Cij = (k - sum(x_i * y_i)) / k + * + * @tparam InType input data-type (for A and B matrices) + * @tparam AccType accumulation data-type + * @tparam OutType output data-type (for C and D matrices) + * @tparam FinalLambda user-defined epilogue lamba + * @tparam Index_ Index type + * @param m number of rows of A and C/D + * @param n number of columns of B and C/D + * @param k number of cols of A and rows of B + * @param pA input matrix + * @param pB input matrix + * @param pD output matrix + * @param fin_op the final element-wise epilogue lambda + * @param stream cuda stream where to launch work + * @param isRowMajor whether the input and output matrices are row major + */ +template +void russellRaoImpl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor) { + typedef std::is_same is_bool; + typedef typename std::conditional::type + russellRaoOutType; + Index_ lda, ldb, ldd; + russellRaoOutType *pDcast = reinterpret_cast(pD); + if (isRowMajor) { + lda = k, ldb = k, ldd = n; + russellRao( + m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + + } else { + lda = n, ldb = m, ldd = m; + russellRao( + n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); + } +} +} // namespace distance +} // namespace raft diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index f94a8d9525..0428e09142 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -21,12 +21,17 @@ add_executable(test_raft test/distance/dist_adj.cu test/distance/dist_canberra.cu test/distance/dist_chebyshev.cu + test/distance/dist_correlation.cu test/distance/dist_cos.cu test/distance/dist_euc_exp.cu test/distance/dist_euc_unexp.cu + test/distance/dist_hamming.cu test/distance/dist_hellinger.cu + test/distance/dist_jensen_shannon.cu + test/distance/dist_kl_divergence.cu test/distance/dist_l1.cu test/distance/dist_minkowski.cu + test/distance/dist_russell_rao.cu test/distance/fused_l2_nn.cu test/eigen_solvers.cu test/handle.cpp diff --git a/cpp/test/distance/dist_correlation.cu b/cpp/test/distance/dist_correlation.cu new file mode 100644 index 0000000000..5d84f18e52 --- /dev/null +++ b/cpp/test/distance/dist_correlation.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceCorrelation + : public DistanceTest {}; + +const std::vector> inputsf = { + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceCorrelation DistanceCorrelationF; +TEST_P(DistanceCorrelationF, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001, 1024, 1024, 32, false, 1234ULL}, + {0.001, 1024, 32, 1024, false, 1234ULL}, + {0.001, 32, 1024, 1024, false, 1234ULL}, + {0.003, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceCorrelation DistanceCorrelationD; +TEST_P(DistanceCorrelationD, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationD, + ::testing::ValuesIn(inputsd)); + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/dist_hamming.cu b/cpp/test/distance/dist_hamming.cu new file mode 100644 index 0000000000..47febd825b --- /dev/null +++ b/cpp/test/distance/dist_hamming.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceHamming + : public DistanceTest {}; + +const std::vector> inputsf = { + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceHamming DistanceHammingF; +TEST_P(DistanceHammingF, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? 
params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001, 1024, 1024, 32, false, 1234ULL}, + {0.001, 1024, 32, 1024, false, 1234ULL}, + {0.001, 32, 1024, 1024, false, 1234ULL}, + {0.003, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceHamming DistanceHammingD; +TEST_P(DistanceHammingD, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingD, + ::testing::ValuesIn(inputsd)); + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/dist_jensen_shannon.cu b/cpp/test/distance/dist_jensen_shannon.cu new file mode 100644 index 0000000000..bc0b56f506 --- /dev/null +++ b/cpp/test/distance/dist_jensen_shannon.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceJensenShannon + : public DistanceTest { +}; + +const std::vector> inputsf = { + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceJensenShannon DistanceJensenShannonF; +TEST_P(DistanceJensenShannonF, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001, 1024, 1024, 32, false, 1234ULL}, + {0.001, 1024, 32, 1024, false, 1234ULL}, + {0.001, 32, 1024, 1024, false, 1234ULL}, + {0.003, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceJensenShannon DistanceJensenShannonD; +TEST_P(DistanceJensenShannonD, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? 
params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonD, + ::testing::ValuesIn(inputsd)); + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/dist_kl_divergence.cu b/cpp/test/distance/dist_kl_divergence.cu new file mode 100644 index 0000000000..884ac4b948 --- /dev/null +++ b/cpp/test/distance/dist_kl_divergence.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceKLDivergence + : public DistanceTest { +}; + +const std::vector> inputsf = { + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceKLDivergence DistanceKLDivergenceF; +TEST_P(DistanceKLDivergenceF, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001, 1024, 1024, 32, false, 1234ULL}, + {0.001, 1024, 32, 1024, false, 1234ULL}, + {0.001, 32, 1024, 1024, false, 1234ULL}, + {0.003, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceKLDivergence DistanceKLDivergenceD; +TEST_P(DistanceKLDivergenceD, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceD, + ::testing::ValuesIn(inputsd)); + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/dist_russell_rao.cu b/cpp/test/distance/dist_russell_rao.cu new file mode 100644 index 0000000000..74ccfb0c2e --- /dev/null +++ b/cpp/test/distance/dist_russell_rao.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceRussellRao + : public DistanceTest {}; + +const std::vector> inputsf = { + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceRussellRao DistanceRussellRaoF; +TEST_P(DistanceRussellRaoF, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001, 1024, 1024, 32, false, 1234ULL}, + {0.001, 1024, 32, 1024, false, 1234ULL}, + {0.001, 32, 1024, 1024, false, 1234ULL}, + {0.003, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceRussellRao DistanceRussellRaoD; +TEST_P(DistanceRussellRaoD, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoD, + ::testing::ValuesIn(inputsd)); + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index fc7b064205..9e3290593d 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -160,6 +160,138 @@ __global__ void naiveLpUnexpDistanceKernel(DataType *dist, const DataType *x, dist[outidx] = acc; } +template +__global__ void naiveHammingDistanceKernel(DataType *dist, const DataType *x, + const DataType *y, int m, int n, + int k, bool isRowMajor) { + int midx = threadIdx.x + blockIdx.x * blockDim.x; + int nidx = threadIdx.y + blockIdx.y * blockDim.y; + if (midx >= m || nidx >= n) return; + DataType acc = DataType(0); + for (int i = 0; i < k; ++i) { + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + acc += (a != b); + } + acc = acc / k; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; +} + +template +__global__ void naiveJensenShannonDistanceKernel(DataType *dist, + const DataType *x, + const DataType *y, int m, + int n, int k, + bool isRowMajor) { + int midx = threadIdx.x + blockIdx.x * blockDim.x; + int nidx = threadIdx.y + blockIdx.y * blockDim.y; + if (midx >= m || nidx >= n) return; + DataType acc = DataType(0); + for (int i = 0; i < k; ++i) { + int xidx = isRowMajor ? 
i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + + DataType m = 0.5f * (a + b); + bool a_zero = a == 0; + bool b_zero = b == 0; + + DataType p = (!a_zero * m) / (a_zero + a); + DataType q = (!b_zero * m) / (b_zero + b); + + bool p_zero = p == 0; + bool q_zero = q == 0; + + acc += + (-a * (!p_zero * log(p + p_zero))) + (-b * (!q_zero * log(q + q_zero))); + } + acc = raft::mySqrt(0.5f * acc); + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; +} + +template +__global__ void naiveRussellRaoDistanceKernel(OutType *dist, const DataType *x, + const DataType *y, int m, int n, + int k, bool isRowMajor) { + int midx = threadIdx.x + blockIdx.x * blockDim.x; + int nidx = threadIdx.y + blockIdx.y * blockDim.y; + if (midx >= m || nidx >= n) return; + OutType acc = OutType(0); + for (int i = 0; i < k; ++i) { + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + acc += (a * b); + } + acc = (k - acc) / k; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; +} + +template +__global__ void naiveKLDivergenceDistanceKernel(OutType *dist, + const DataType *x, + const DataType *y, int m, int n, + int k, bool isRowMajor) { + int midx = threadIdx.x + blockIdx.x * blockDim.x; + int nidx = threadIdx.y + blockIdx.y * blockDim.y; + if (midx >= m || nidx >= n) return; + OutType acc = OutType(0); + for (int i = 0; i < k; ++i) { + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + bool b_zero = (b == 0); + const auto m = (!b_zero) * (a / b); + const bool m_zero = (m == 0); + acc += (a * (!m_zero) * log(m + m_zero)); + } + acc = 0.5f * acc; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; +} + +template +__global__ void naiveCorrelationDistanceKernel(OutType *dist, const DataType *x, + const DataType *y, int m, int n, + int k, bool isRowMajor) { + int midx = threadIdx.x + blockIdx.x * blockDim.x; + int nidx = threadIdx.y + blockIdx.y * blockDim.y; + if (midx >= m || nidx >= n) return; + OutType acc = OutType(0); + auto a_norm = DataType(0); + auto b_norm = DataType(0); + auto a_sq_norm = DataType(0); + auto b_sq_norm = DataType(0); + for (int i = 0; i < k; ++i) { + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + a_norm += a; + b_norm += b; + a_sq_norm += (a * a); + b_sq_norm += (b * b); + acc += (a * b); + } + + auto numer = k * acc - (a_norm * b_norm); + auto Q_denom = k * a_sq_norm - (a_norm * a_norm); + auto R_denom = k * b_sq_norm - (b_norm * b_norm); + + acc = 1 - (numer / raft::mySqrt(Q_denom * R_denom)); + + int outidx = isRowMajor ? 
midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; +} + template void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, int n, int k, raft::distance::DistanceType type, @@ -193,6 +325,26 @@ void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, naiveLpUnexpDistanceKernel <<>>(dist, x, y, m, n, k, isRowMajor, metric_arg); break; + case raft::distance::DistanceType::HammingUnexpanded: + naiveHammingDistanceKernel + <<>>(dist, x, y, m, n, k, isRowMajor); + break; + case raft::distance::DistanceType::JensenShannon: + naiveJensenShannonDistanceKernel + <<>>(dist, x, y, m, n, k, isRowMajor); + break; + case raft::distance::DistanceType::RusselRaoExpanded: + naiveRussellRaoDistanceKernel + <<>>(dist, x, y, m, n, k, isRowMajor); + break; + case raft::distance::DistanceType::KLDivergence: + naiveKLDivergenceDistanceKernel + <<>>(dist, x, y, m, n, k, isRowMajor); + break; + case raft::distance::DistanceType::CorrelationExpanded: + naiveCorrelationDistanceKernel + <<>>(dist, x, y, m, n, k, isRowMajor); + break; default: FAIL() << "should be here\n"; } @@ -247,10 +399,19 @@ class DistanceTest : public ::testing::TestWithParam> { raft::allocate(dist_ref, m * n); raft::allocate(dist, m * n); raft::allocate(dist2, m * n); - if (distanceType == raft::distance::DistanceType::HellingerExpanded) { + if (distanceType == raft::distance::DistanceType::HellingerExpanded || + distanceType == raft::distance::DistanceType::JensenShannon || + distanceType == raft::distance::DistanceType::KLDivergence) { // Hellinger works only on positive numbers r.uniform(x, m * k, DataType(0.0), DataType(1.0), stream); r.uniform(y, n * k, DataType(0.0), DataType(1.0), stream); + } else if (distanceType == + raft::distance::DistanceType::RusselRaoExpanded) { + r.uniform(x, m * k, DataType(0.0), DataType(1.0), stream); + r.uniform(y, n * k, DataType(0.0), DataType(1.0), stream); + // Russel rao works on boolean values. 
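+      // bernoulli(p = 0.5) is assumed here to overwrite the uniform fill above
+      // with 0/1 samples, which the Russell-Rao reference kernel treats as
+      // boolean input.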
+ r.bernoulli(x, m * k, 0.5f, stream); + r.bernoulli(y, n * k, 0.5f, stream); } else { r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream); r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); From 1c4e1e600b8b5da4265290f031e7785e824b706e Mon Sep 17 00:00:00 2001 From: Victor Lafargue Date: Fri, 27 Aug 2021 17:26:55 +0200 Subject: [PATCH 011/171] Miscellaneous tech debts/cleanups (#286) Miscellaneous updates to solve tech debts in RAFT : - [x] Removal of handle host and device allocators - [x] Addition of a `get_thrust_policy` method to the handle - [x] Usage of `get_thrust_policy` where handle is available - [x] Removal of `rmm::device_vector` - [x] Use of RMM device allocator in the `raft::allocate` function - [x] Creation of an allocation + deallocation helper system - [x] Usage of `rmm::exec_policy` instead of `thrust::cuda::par.on` when no handle is available Authors: - Victor Lafargue (https://github.com/viclafargue) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/286 --- cpp/cmake/thirdparty/get_rmm.cmake | 2 +- cpp/include/raft/common/cub_wrappers.cuh | 4 +- cpp/include/raft/comms/helper.hpp | 10 +- cpp/include/raft/comms/std_comms.hpp | 40 ++---- cpp/include/raft/comms/test.hpp | 41 ++---- cpp/include/raft/cudart_utils.h | 46 +++++-- cpp/include/raft/distance/distance.cuh | 7 +- cpp/include/raft/handle.hpp | 30 +--- cpp/include/raft/label/classlabels.cuh | 57 ++++---- cpp/include/raft/lap/lap.cuh | 87 ++++++------ cpp/include/raft/lap/lap_functions.cuh | 47 +++---- .../raft/linalg/cholesky_r1_update.cuh | 8 +- cpp/include/raft/linalg/eig.cuh | 28 ++-- cpp/include/raft/linalg/init.h | 3 +- cpp/include/raft/linalg/qr.cuh | 19 ++- cpp/include/raft/linalg/svd.cuh | 28 ++-- cpp/include/raft/linalg/transpose.h | 4 +- cpp/include/raft/matrix/math.cuh | 10 +- cpp/include/raft/matrix/matrix.cuh | 14 +- cpp/include/raft/random/rng.cuh | 17 +-- cpp/include/raft/sparse/convert/csr.cuh | 24 ++-- cpp/include/raft/sparse/coo.cuh | 38 +++-- cpp/include/raft/sparse/csr.cuh | 19 +-- .../raft/sparse/distance/bin_distance.cuh | 11 +- cpp/include/raft/sparse/distance/coo_spmv.cuh | 3 - .../coo_spmv_strategies/base_strategy.cuh | 2 - .../coo_mask_row_iterators.cuh | 1 - .../coo_spmv_strategies/hash_strategy.cuh | 2 +- cpp/include/raft/sparse/distance/distance.cuh | 1 - .../raft/sparse/distance/ip_distance.cuh | 3 +- .../raft/sparse/distance/l2_distance.cuh | 21 +-- cpp/include/raft/sparse/distance/utils.cuh | 2 - .../sparse/hierarchy/detail/agglomerative.cuh | 13 +- .../hierarchy/detail/connectivities.cuh | 13 +- .../raft/sparse/hierarchy/detail/mst.cuh | 16 +-- .../raft/sparse/hierarchy/single_linkage.hpp | 1 - cpp/include/raft/sparse/linalg/add.cuh | 10 +- cpp/include/raft/sparse/linalg/spectral.cuh | 23 ++-- cpp/include/raft/sparse/linalg/symmetrize.cuh | 33 ++--- cpp/include/raft/sparse/linalg/transpose.h | 12 +- .../raft/sparse/mst/detail/mst_solver_inl.cuh | 122 ++++++++-------- cpp/include/raft/sparse/mst/detail/utils.cuh | 4 +- cpp/include/raft/sparse/mst/mst_solver.cuh | 24 ++-- cpp/include/raft/sparse/op/filter.cuh | 36 ++--- cpp/include/raft/sparse/op/reduce.cuh | 10 +- cpp/include/raft/sparse/op/sort.h | 21 +-- .../sparse/selection/connect_components.cuh | 36 ++--- cpp/include/raft/sparse/selection/knn.cuh | 2 - .../raft/sparse/selection/knn_graph.cuh | 1 - cpp/include/raft/spatial/knn/ann.hpp | 3 - .../knn/detail/ann_quantized_faiss.cuh | 8 +- .../knn/detail/knn_brute_force_faiss.cuh | 19 ++- 
.../raft/spatial/knn/detail/processing.hpp | 33 ++--- cpp/include/raft/spatial/knn/knn.hpp | 6 +- cpp/include/raft/spectral/cluster_solvers.hpp | 9 +- cpp/include/raft/spectral/kmeans.hpp | 130 ++++++++---------- cpp/include/raft/spectral/matrix_wrappers.hpp | 83 ++++------- .../raft/spectral/modularity_maximization.hpp | 29 ++-- cpp/include/raft/spectral/partition.hpp | 29 ++-- cpp/include/raft/spectral/spectral_util.hpp | 23 ++-- cpp/test/cluster_solvers.cu | 9 +- cpp/test/distance/dist_adj.cu | 22 ++- cpp/test/distance/distance_base.cuh | 24 ++-- cpp/test/distance/fused_l2_nn.cu | 32 ++--- cpp/test/eigen_solvers.cu | 9 +- cpp/test/label/label.cu | 34 ++--- cpp/test/label/merge_labels.cu | 6 +- cpp/test/lap/lap.cu | 17 ++- cpp/test/linalg/add.cu | 14 +- cpp/test/linalg/binary_op.cu | 16 +-- cpp/test/linalg/cholesky_r1.cu | 27 ++-- cpp/test/linalg/coalesced_reduction.cu | 15 +- cpp/test/linalg/divide.cu | 15 +- cpp/test/linalg/eig.cu | 34 ++--- cpp/test/linalg/eig_sel.cu | 19 +-- cpp/test/linalg/eltwise.cu | 14 +- cpp/test/linalg/map.cu | 25 ++-- cpp/test/linalg/map_then_reduce.cu | 26 ++-- cpp/test/linalg/matrix_vector_op.cu | 21 ++- cpp/test/linalg/multiply.cu | 15 +- cpp/test/linalg/norm.cu | 18 ++- cpp/test/linalg/reduce.cu | 10 +- cpp/test/linalg/reduce.cuh | 15 +- cpp/test/linalg/strided_reduction.cu | 10 +- cpp/test/linalg/subtract.cu | 18 ++- cpp/test/linalg/svd.cu | 28 ++-- cpp/test/linalg/transpose.cu | 12 +- cpp/test/linalg/unary_op.cu | 11 +- cpp/test/matrix/math.cu | 51 +++---- cpp/test/matrix/matrix.cu | 34 +++-- cpp/test/mr/device/buffer.cpp | 18 ++- cpp/test/mst.cu | 24 +++- cpp/test/random/rng.cu | 41 +++--- cpp/test/random/rng_int.cu | 12 +- cpp/test/random/sample_without_replacement.cu | 14 +- cpp/test/sparse/add.cu | 43 ++---- cpp/test/sparse/connect_components.cu | 11 +- cpp/test/sparse/convert_coo.cu | 10 +- cpp/test/sparse/convert_csr.cu | 26 ++-- cpp/test/sparse/csr_row_slice.cu | 34 ++--- cpp/test/sparse/csr_to_dense.cu | 22 +-- cpp/test/sparse/csr_transpose.cu | 36 ++--- cpp/test/sparse/degree.cu | 36 ++--- cpp/test/sparse/dist_coo_spmv.cu | 29 ++-- cpp/test/sparse/distance.cu | 21 +-- cpp/test/sparse/filter.cu | 13 +- cpp/test/sparse/knn.cu | 27 ++-- cpp/test/sparse/knn_graph.cu | 16 +-- cpp/test/sparse/linkage.cu | 39 ++---- cpp/test/sparse/norm.cu | 15 +- cpp/test/sparse/reduce.cu | 3 +- cpp/test/sparse/row_op.cu | 12 +- cpp/test/sparse/selection.cu | 24 +--- cpp/test/sparse/sort.cu | 13 +- cpp/test/sparse/symmetrize.cu | 33 ++--- cpp/test/spatial/haversine.cu | 33 ++--- cpp/test/spatial/knn.cu | 34 +++-- cpp/test/spectral_matrix.cu | 19 ++- cpp/test/stats/mean.cu | 10 +- cpp/test/stats/mean_center.cu | 18 ++- cpp/test/stats/stddev.cu | 18 ++- cpp/test/stats/sum.cu | 12 +- python/raft/common/handle.pxd | 2 - 123 files changed, 1114 insertions(+), 1583 deletions(-) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 85ebc6238e..e990ab1367 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -44,4 +44,4 @@ endfunction() set(RAFT_MIN_VERSION_rmm "${RAFT_VERSION_MAJOR}.${RAFT_VERSION_MINOR}.00") -find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) +find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) \ No newline at end of file diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh index 8d5b29f700..8e3519fea5 100644 --- a/cpp/include/raft/common/cub_wrappers.cuh +++ b/cpp/include/raft/common/cub_wrappers.cuh @@ -17,7 +17,7 @@ #pragma once #include 
-#include +#include namespace raft { @@ -34,7 +34,7 @@ namespace raft { * @param stream cuda stream */ template -void sortPairs(raft::mr::device::buffer &workspace, const KeyT *inKeys, +void sortPairs(rmm::device_uvector &workspace, const KeyT *inKeys, KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len, cudaStream_t stream) { size_t worksize; diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index 7b24e31bbe..e01490d728 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -38,11 +38,10 @@ namespace comms { */ void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, int num_ranks, int rank) { - auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); auto communicator = std::make_shared(std::unique_ptr( - new raft::comms::std_comms(nccl_comm, num_ranks, rank, d_alloc, stream))); + new raft::comms::std_comms(nccl_comm, num_ranks, rank, stream))); handle->set_comms(communicator); } @@ -80,12 +79,11 @@ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, } } - auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); - auto communicator = std::make_shared(std::unique_ptr( - new raft::comms::std_comms(nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, - num_ranks, rank, d_alloc, stream))); + auto communicator = std::make_shared( + std::unique_ptr(new raft::comms::std_comms( + nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, stream))); handle->set_comms(communicator); } diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 765e8741bb..ff75931fb9 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -20,7 +20,7 @@ #include #include -#include +#include #include @@ -64,17 +64,16 @@ class std_comms : public comms_iface { */ std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker, std::shared_ptr eps, int num_ranks, int rank, - const std::shared_ptr device_allocator, cudaStream_t stream, bool subcomms_ucp = true) : nccl_comm_(nccl_comm), stream_(stream), + status_(2, stream), num_ranks_(num_ranks), rank_(rank), subcomms_ucp_(subcomms_ucp), ucp_worker_(ucp_worker), ucp_eps_(eps), - next_request_id_(0), - device_allocator_(device_allocator) { + next_request_id_(0) { initialize(); }; @@ -85,27 +84,19 @@ class std_comms : public comms_iface { * @param rank rank of the current worker */ std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, - const std::shared_ptr device_allocator, cudaStream_t stream) : nccl_comm_(nccl_comm), stream_(stream), + status_(2, stream), num_ranks_(num_ranks), rank_(rank), - subcomms_ucp_(false), - device_allocator_(device_allocator) { + subcomms_ucp_(false) { initialize(); }; - virtual ~std_comms() { - device_allocator_->deallocate(sendbuff_, sizeof(int), stream_); - device_allocator_->deallocate(recvbuff_, sizeof(int), stream_); - } - void initialize() { - sendbuff_ = reinterpret_cast( - device_allocator_->allocate(sizeof(int), stream_)); - recvbuff_ = reinterpret_cast( - device_allocator_->allocate(sizeof(int), stream_)); + sendbuff_ = status_.data(); + recvbuff_ = status_.data() + 1; } int get_size() const { return num_ranks_; } @@ -113,8 +104,8 @@ class std_comms : public comms_iface { int get_rank() const { return rank_; } std::unique_ptr comm_split(int color, int key) const { - mr::device::buffer d_colors(device_allocator_, stream_, get_size()); - mr::device::buffer d_keys(device_allocator_, stream_, 
get_size()); + rmm::device_uvector d_colors(get_size(), stream_); + rmm::device_uvector d_keys(get_size(), stream_); update_device(d_colors.data() + get_rank(), &color, 1, stream_); update_device(d_keys.data() + get_rank(), &key, 1, stream_); @@ -167,12 +158,12 @@ class std_comms : public comms_iface { if (ucp_worker_ != nullptr && subcomms_ucp_) { auto eps_sp = std::make_shared(new_ucx_ptrs.data()); - return std::unique_ptr(new std_comms( - nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, subcomm_ranks.size(), key, - device_allocator_, stream_, subcomms_ucp_)); + return std::unique_ptr( + new std_comms(nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, + subcomm_ranks.size(), key, stream_, subcomms_ucp_)); } else { - return std::unique_ptr(new std_comms( - nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_)); + return std::unique_ptr( + new std_comms(nccl_comm, subcomm_ranks.size(), key, stream_)); } } @@ -465,6 +456,7 @@ class std_comms : public comms_iface { cudaStream_t stream_; int *sendbuff_, *recvbuff_; + rmm::device_uvector status_; int num_ranks_; int rank_; @@ -478,8 +470,6 @@ class std_comms : public comms_iface { mutable std::unordered_map requests_in_flight_; mutable std::unordered_set free_requests_; - - std::shared_ptr device_allocator_; }; } // end namespace comms } // end namespace raft diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index 4e95c4eef0..17db8e88af 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -44,8 +43,7 @@ bool test_collective_allreduce(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); - temp_d.resize(1, stream); + rmm::device_scalar temp_d(stream); CUDA_CHECK( cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); @@ -76,8 +74,7 @@ bool test_collective_broadcast(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); - temp_d.resize(1, stream); + rmm::device_scalar temp_d(stream); if (communicator.get_rank() == root) CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), @@ -104,8 +101,7 @@ bool test_collective_reduce(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); - temp_d.resize(1, stream); + rmm::device_scalar temp_d(stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); @@ -134,11 +130,8 @@ bool test_collective_allgather(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); - temp_d.resize(1, stream); - - raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, - communicator.get_size()); + rmm::device_scalar temp_d(stream); + rmm::device_uvector recv_d(communicator.get_size(), stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); @@ -169,12 +162,9 @@ bool test_collective_gather(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); - temp_d.resize(1, stream); - - raft::mr::device::buffer recv_d( - handle.get_device_allocator(), stream, - communicator.get_rank() == root ? 
communicator.get_size() : 0); + rmm::device_scalar temp_d(stream); + rmm::device_uvector recv_d( + communicator.get_rank() == root ? communicator.get_size() : 0, stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); @@ -211,12 +201,9 @@ bool test_collective_gatherv(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); - temp_d.resize(sends.size(), stream); - - raft::mr::device::buffer recv_d( - handle.get_device_allocator(), stream, - communicator.get_rank() == root ? displacements.back() : 0); + rmm::device_uvector temp_d(sends.size(), stream); + rmm::device_uvector recv_d( + communicator.get_rank() == root ? displacements.back() : 0, stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, @@ -256,10 +243,8 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream, - sends.size()); - raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, - 1); + rmm::device_uvector temp_d(sends.size(), stream); + rmm::device_scalar recv_d(stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 86c60addf2..85ca310530 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -17,6 +17,8 @@ #pragma once #include +#include +#include #include @@ -25,6 +27,8 @@ #include #include #include +#include +#include ///@todo: enable once logging has been enabled in raft //#include "logger.hpp" @@ -200,7 +204,8 @@ class grid_1d_block_t { * @param stream cuda stream */ template -void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { +void copy(Type* dst, const Type* src, size_t len, + rmm::cuda_stream_view stream) { CUDA_CHECK( cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); } @@ -214,20 +219,20 @@ void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { /** performs a host to device copy */ template void update_device(Type* d_ptr, const Type* h_ptr, size_t len, - cudaStream_t stream) { + rmm::cuda_stream_view stream) { copy(d_ptr, h_ptr, len, stream); } /** performs a device to host copy */ template void update_host(Type* h_ptr, const Type* d_ptr, size_t len, - cudaStream_t stream) { + rmm::cuda_stream_view stream) { copy(h_ptr, d_ptr, len, stream); } template void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, - cudaStream_t stream) { + rmm::cuda_stream_view stream) { CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); } @@ -259,11 +264,36 @@ void print_device_vector(const char* variable_name, const T* devMem, } /** @} */ -/** cuda malloc */ +static std::mutex mutex_; +static std::unordered_map allocations; + +template +void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, + bool setZero = false) { + size_t size = len * sizeof(Type); + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); + if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); + + std::lock_guard _(mutex_); + allocations[ptr] = size; +} + template -void allocate(Type*& ptr, size_t len, bool setZero = false) { - CUDA_CHECK(cudaMalloc((void**)&ptr, sizeof(Type) * 
len)); - if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, sizeof(Type) * len)); +void deallocate(Type*& ptr, rmm::cuda_stream_view stream) { + std::lock_guard _(mutex_); + size_t size = allocations[ptr]; + allocations.erase(ptr); + rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); +} + +inline void deallocate_all(rmm::cuda_stream_view stream) { + std::lock_guard _(mutex_); + for (auto& alloc : allocations) { + void* ptr = alloc.first; + size_t size = alloc.second; + rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); + } + allocations.clear(); } /** helper method to get max usable shared mem per block parameter */ diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 02d8fb6d03..65b4f3b830 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -31,7 +31,7 @@ #include #include #include -#include +#include namespace raft { namespace distance { @@ -376,7 +376,7 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m, template void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m, Index_ n, Index_ k, - raft::mr::device::buffer &workspace, + rmm::device_uvector &workspace, cudaStream_t stream, bool isRowMajor, Type metric_arg = 2.0f) { auto worksize = @@ -389,8 +389,7 @@ void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m, template void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m, - Index_ n, Index_ k, - raft::mr::device::buffer &workspace, + Index_ n, Index_ k, rmm::device_uvector &workspace, raft::distance::DistanceType metric, cudaStream_t stream, bool isRowMajor = true, Type metric_arg = 2.0f) { switch (metric) { diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index dbe7e83189..c925669530 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -36,9 +36,8 @@ #include #include #include -#include -#include #include +#include #include "cudart_utils.h" namespace raft { @@ -63,10 +62,9 @@ class handle_t { CUDA_CHECK(cudaGetDevice(&cur_dev)); return cur_dev; }()), - streams_(n_streams), - device_allocator_(std::make_shared()), - host_allocator_(std::make_shared()) { + streams_(n_streams) { create_resources(); + thrust_policy_ = std::make_unique(user_stream_); } /** @@ -86,10 +84,9 @@ class handle_t { "ERROR: the main handle must have at least one worker stream\n"); prop_ = other.get_device_properties(); device_prop_initialized_ = true; - device_allocator_ = other.get_device_allocator(); - host_allocator_ = other.get_host_allocator(); create_resources(); set_stream(other.get_internal_stream(stream_id)); + thrust_policy_ = std::make_unique(user_stream_); } /** Destroys all held-up resources */ @@ -103,20 +100,6 @@ class handle_t { return rmm::cuda_stream_view(user_stream_); } - void set_device_allocator(std::shared_ptr allocator) { - device_allocator_ = allocator; - } - std::shared_ptr get_device_allocator() const { - return device_allocator_; - } - - void set_host_allocator(std::shared_ptr allocator) { - host_allocator_ = allocator; - } - std::shared_ptr get_host_allocator() const { - return host_allocator_; - } - cublasHandle_t get_cublas_handle() const { std::lock_guard _(mutex_); if (!cublas_initialized_) { @@ -153,6 +136,8 @@ class handle_t { return cusparse_handle_; } + rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } + // legacy compatibility for cuML cudaStream_t get_internal_stream(int sid) const { 
return streams_.get_stream(sid).value(); @@ -236,8 +221,7 @@ class handle_t { mutable bool cusolver_sp_initialized_{false}; mutable cusparseHandle_t cusparse_handle_; mutable bool cusparse_initialized_{false}; - std::shared_ptr device_allocator_; - std::shared_ptr host_allocator_; + std::unique_ptr thrust_policy_{nullptr}; cudaStream_t user_stream_{nullptr}; cudaEvent_t event_; mutable cudaDeviceProp prop_; diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index 0da7da2eb6..b2302836bc 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -21,8 +21,8 @@ #include #include #include -#include -#include +#include +#include namespace raft { namespace label { @@ -36,41 +36,39 @@ namespace label { * \tparam value_t numeric type of the arrays with class labels * \param [in] y device array of labels, size [n] * \param [in] n number of labels - * \param [out] y_unique device array of unique labels, unallocated on entry, + * \param [out] unique device array of unique labels, unallocated on entry, * on exit it has size [n_unique] * \param [out] n_unique number of unique labels * \param [in] stream cuda stream - * \param [in] allocator device allocator */ template -void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, - cudaStream_t stream, - std::shared_ptr allocator) { - raft::mr::device::buffer y2(allocator, stream, n); - raft::mr::device::buffer y3(allocator, stream, n); - raft::mr::device::buffer d_num_selected(allocator, stream, 1); +int getUniquelabels(rmm::device_uvector &unique, value_t *y, size_t n, + cudaStream_t stream) { + rmm::device_scalar d_num_selected(stream); + rmm::device_uvector workspace(n, stream); size_t bytes = 0; size_t bytes2 = 0; // Query how much temporary storage we will need for cub operations // and allocate it - cub::DeviceRadixSort::SortKeys(NULL, bytes, y, y2.data(), n); - cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(), + cub::DeviceRadixSort::SortKeys(NULL, bytes, y, workspace.data(), n); + cub::DeviceSelect::Unique(NULL, bytes2, workspace.data(), workspace.data(), d_num_selected.data(), n); bytes = max(bytes, bytes2); - raft::mr::device::buffer cub_storage(allocator, stream, bytes); + rmm::device_uvector cub_storage(bytes, stream); // Select Unique classes - cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, y2.data(), n); - cub::DeviceSelect::Unique(cub_storage.data(), bytes, y2.data(), y3.data(), - d_num_selected.data(), n); - raft::update_host(n_unique, d_num_selected.data(), 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, workspace.data(), + n); + cub::DeviceSelect::Unique(cub_storage.data(), bytes, workspace.data(), + workspace.data(), d_num_selected.data(), n); + int n_unique = d_num_selected.value(stream); // Copy unique classes to output - *y_unique = - (value_t *)allocator->allocate(*n_unique * sizeof(value_t), stream); - raft::copy(*y_unique, y3.data(), *n_unique, stream); + unique.resize(n_unique, stream); + raft::copy(unique.data(), workspace.data(), n_unique, stream); + + return n_unique; } /** @@ -147,22 +145,17 @@ __global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in, */ template void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, - Lambda filter_op, - std::shared_ptr allocator, - bool zero_based = false) { + Lambda filter_op, bool zero_based = false) { static const size_t TPB_X = 256; dim3 
blocks(raft::ceildiv(N, TPB_X)); dim3 threads(TPB_X); - Type *map_ids; - int num_clusters; - getUniquelabels(in, N, &map_ids, &num_clusters, stream, allocator); + rmm::device_uvector map_ids(0, stream); + int num_clusters = getUniquelabels(map_ids, in, N, stream); map_label_kernel<<>>( - map_ids, num_clusters, in, out, N, filter_op, zero_based); - - allocator->deallocate(map_ids, num_clusters * sizeof(Type), stream); + map_ids.data(), num_clusters, in, out, N, filter_op, zero_based); } /** @@ -184,11 +177,9 @@ void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, */ template void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, - std::shared_ptr allocator, bool zero_based = false) { make_monotonic( - out, in, N, stream, [] __device__(Type val) { return false; }, allocator, - zero_based); + out, in, N, stream, [] __device__(Type val) { return false; }, zero_based); } }; // namespace label }; // end namespace raft diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh index 6bc1c08029..f64afb3549 100644 --- a/cpp/include/raft/lap/lap.cuh +++ b/cpp/include/raft/lap/lap.cuh @@ -25,6 +25,7 @@ #pragma once #include +#include #include "d_structs.h" #include "lap_functions.cuh" @@ -44,19 +45,19 @@ class LinearAssignmentProblem { VertexData d_row_data_dev, d_col_data_dev; raft::handle_t const &handle_; - raft::mr::device::buffer row_covers_v; - raft::mr::device::buffer col_covers_v; - raft::mr::device::buffer row_duals_v; - raft::mr::device::buffer col_duals_v; - raft::mr::device::buffer col_slacks_v; - raft::mr::device::buffer row_is_visited_v; - raft::mr::device::buffer col_is_visited_v; - raft::mr::device::buffer row_parents_v; - raft::mr::device::buffer col_parents_v; - raft::mr::device::buffer row_children_v; - raft::mr::device::buffer col_children_v; - raft::mr::device::buffer obj_val_primal_v; - raft::mr::device::buffer obj_val_dual_v; + rmm::device_uvector row_covers_v; + rmm::device_uvector col_covers_v; + rmm::device_uvector row_duals_v; + rmm::device_uvector col_duals_v; + rmm::device_uvector col_slacks_v; + rmm::device_uvector row_is_visited_v; + rmm::device_uvector col_is_visited_v; + rmm::device_uvector row_parents_v; + rmm::device_uvector col_parents_v; + rmm::device_uvector row_children_v; + rmm::device_uvector col_children_v; + rmm::device_uvector obj_val_primal_v; + rmm::device_uvector obj_val_dual_v; public: LinearAssignmentProblem(raft::handle_t const &handle, vertex_t size, @@ -66,19 +67,19 @@ class LinearAssignmentProblem { batchsize_(batchsize), epsilon_(epsilon), d_costs_(nullptr), - row_covers_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_covers_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - row_duals_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_duals_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_slacks_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - row_is_visited_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_is_visited_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - row_parents_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_parents_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - row_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - obj_val_primal_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - obj_val_dual_v(handle_.get_device_allocator(), 
handle_.get_stream(), 0) {} + row_covers_v(0, handle_.get_stream()), + col_covers_v(0, handle_.get_stream()), + row_duals_v(0, handle_.get_stream()), + col_duals_v(0, handle_.get_stream()), + col_slacks_v(0, handle_.get_stream()), + row_is_visited_v(0, handle_.get_stream()), + col_is_visited_v(0, handle_.get_stream()), + row_parents_v(0, handle_.get_stream()), + col_parents_v(0, handle_.get_stream()), + row_children_v(0, handle_.get_stream()), + col_children_v(0, handle_.get_stream()), + obj_val_primal_v(0, handle_.get_stream()), + obj_val_dual_v(0, handle_.get_stream()) {} // Executes Hungarian algorithm on the input cost matrix. void solve(weight_t const *d_cost_matrix, vertex_t *d_row_assignment, @@ -152,19 +153,20 @@ class LinearAssignmentProblem { private: // Helper function for initializing global variables and arrays on a single host. void initializeDevice() { - row_covers_v.resize(batchsize_ * size_); - col_covers_v.resize(batchsize_ * size_); - row_duals_v.resize(batchsize_ * size_); - col_duals_v.resize(batchsize_ * size_); - col_slacks_v.resize(batchsize_ * size_); - row_is_visited_v.resize(batchsize_ * size_); - col_is_visited_v.resize(batchsize_ * size_); - row_parents_v.resize(batchsize_ * size_); - col_parents_v.resize(batchsize_ * size_); - row_children_v.resize(batchsize_ * size_); - col_children_v.resize(batchsize_ * size_); - obj_val_primal_v.resize(batchsize_); - obj_val_dual_v.resize(batchsize_); + cudaStream_t stream = handle_.get_stream(); + row_covers_v.resize(batchsize_ * size_, stream); + col_covers_v.resize(batchsize_ * size_, stream); + row_duals_v.resize(batchsize_ * size_, stream); + col_duals_v.resize(batchsize_ * size_, stream); + col_slacks_v.resize(batchsize_ * size_, stream); + row_is_visited_v.resize(batchsize_ * size_, stream); + col_is_visited_v.resize(batchsize_ * size_, stream); + row_parents_v.resize(batchsize_ * size_, stream); + col_parents_v.resize(batchsize_ * size_, stream); + row_children_v.resize(batchsize_ * size_, stream); + col_children_v.resize(batchsize_ * size_, stream); + obj_val_primal_v.resize(batchsize_, stream); + obj_val_dual_v.resize(batchsize_, stream); d_vertices_dev.row_covers = row_covers_v.data(); d_vertices_dev.col_covers = col_covers_v.data(); @@ -231,17 +233,16 @@ class LinearAssignmentProblem { int hungarianStep3() { int next; - raft::mr::device::buffer flag_v(handle_.get_device_allocator(), - handle_.get_stream(), 1); + rmm::device_scalar flag_v(handle_.get_stream()); bool h_flag = false; - raft::update_device(flag_v.data(), &h_flag, 1, handle_.get_stream()); + flag_v.set_value_async(h_flag, handle_.get_stream()); detail::executeZeroCover(handle_, d_costs_, d_vertices_dev, d_row_data_dev, d_col_data_dev, flag_v.data(), batchsize_, size_, epsilon_); - raft::update_host(&h_flag, flag_v.data(), 1, handle_.get_stream()); + h_flag = flag_v.value(handle_.get_stream()); next = h_flag ? 
4 : 5; diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh index 7640f3f816..830940f0ec 100644 --- a/cpp/include/raft/lap/lap_functions.cuh +++ b/cpp/include/raft/lap/lap_functions.cuh @@ -29,7 +29,8 @@ #include #include #include -#include +#include +#include #include #include @@ -125,10 +126,8 @@ inline void computeInitialAssignments(raft::handle_t const &handle, std::size_t size = SP * N; - raft::mr::device::buffer row_lock_v(handle.get_device_allocator(), - handle.get_stream(), size); - raft::mr::device::buffer col_lock_v(handle.get_device_allocator(), - handle.get_stream(), size); + rmm::device_uvector row_lock_v(size, handle.get_stream()); + rmm::device_uvector col_lock_v(size, handle.get_stream()); thrust::fill_n(thrust::device, d_vertices.row_assignments, size, -1); thrust::fill_n(thrust::device, d_vertices.col_assignments, size, -1); @@ -214,25 +213,21 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle, weight_t epsilon) { vertex_t M; - raft::mr::device::buffer csr_ptrs_v(handle.get_device_allocator(), - handle.get_stream(), 0); - raft::mr::device::buffer csr_neighbors_v( - handle.get_device_allocator(), handle.get_stream(), 0); + rmm::device_uvector csr_ptrs_v(0, handle.get_stream()); + rmm::device_uvector csr_neighbors_v(0, handle.get_stream()); { dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - handle.get_stream(), SP * N); - raft::mr::device::buffer addresses_v( - handle.get_device_allocator(), handle.get_stream(), SP * N); + rmm::device_uvector predicates_v(SP * N, handle.get_stream()); + rmm::device_uvector addresses_v(SP * N, handle.get_stream()); thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false); thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0}); - csr_ptrs_v.resize(SP + 1); + csr_ptrs_v.resize(SP + 1, handle.get_stream()); thrust::fill_n(thrust::device, csr_ptrs_v.data(), (SP + 1), vertex_t{-1}); @@ -251,7 +246,7 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle, addresses_v.end(), addresses_v.begin()); if (M > 0) { - csr_neighbors_v.resize(M); + csr_neighbors_v.resize(M, handle.get_stream()); kernel_rowScatterCSR<<>>( @@ -300,10 +295,8 @@ inline void reversePass(raft::handle_t const &handle, raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, size); - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - handle.get_stream(), size); - raft::mr::device::buffer addresses_v(handle.get_device_allocator(), - handle.get_stream(), size); + rmm::device_uvector predicates_v(size, handle.get_stream()); + rmm::device_uvector addresses_v(size, handle.get_stream()); thrust::fill_n(thrust::device, predicates_v.data(), size, false); thrust::fill_n(thrust::device, addresses_v.data(), size, vertex_t{0}); @@ -329,8 +322,7 @@ inline void reversePass(raft::handle_t const &handle, raft::lap::detail::calculateLinearDims( blocks_per_grid_1, threads_per_block_1, total_blocks_1, csr_size); - raft::mr::device::buffer elements_v( - handle.get_device_allocator(), handle.get_stream(), csr_size); + rmm::device_uvector elements_v(csr_size, handle.get_stream()); kernel_augmentScatter<<>>( @@ -358,10 +350,8 @@ inline void augmentationPass(raft::handle_t const &handle, raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP * N); - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - 
handle.get_stream(), SP * N); - raft::mr::device::buffer addresses_v(handle.get_device_allocator(), - handle.get_stream(), SP * N); + rmm::device_uvector predicates_v(SP * N, handle.get_stream()); + rmm::device_uvector addresses_v(SP * N, handle.get_stream()); thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false); thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0}); @@ -388,8 +378,8 @@ inline void augmentationPass(raft::handle_t const &handle, raft::lap::detail::calculateLinearDims( blocks_per_grid_1, threads_per_block_1, total_blocks_1, row_ids_csr_size); - raft::mr::device::buffer elements_v( - handle.get_device_allocator(), handle.get_stream(), row_ids_csr_size); + rmm::device_uvector elements_v(row_ids_csr_size, + handle.get_stream()); kernel_augmentScatter<<>>( @@ -418,8 +408,7 @@ inline void dualUpdate(raft::handle_t const &handle, dim3 threads_per_block; int total_blocks; - raft::mr::device::buffer sp_min_v(handle.get_device_allocator(), - handle.get_stream(), 1); + rmm::device_scalar sp_min_v(handle.get_stream()); raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh index b5a93c4953..d6d064c20e 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -63,11 +63,11 @@ namespace linalg { * @code{.cpp} * // Initialize arrays * int ld_L = n_rows; - * device_buffer L(allocator, stream, ld_L * n_rows); + * rmm::device_uvector L(ld_L * n_rows, stream); * MLCommon::LinAlg::choleskyRank1Update(handle, L, n_rows, ld_L, nullptr, * &n_bytes, CUBLAS_FILL_MODE_LOWER, * stream); - * device_buffer workspace(allocator, stream, n_bytes); + * rmm::device_uvector workspace(n_bytes, stream); * * for (n=1; n<=n_rows; rank++) { * // Calculate a new row/column of matrix A into A_new @@ -87,11 +87,11 @@ namespace linalg { * @code{.cpp} * // Initialize arrays * int ld_U = n_rows; - * device_buffer U(allocator, stream, ld_U * n_rows); + * rmm::device_uvector U(ld_U * n_rows, stream); * MLCommon::LinAlg::choleskyRank1Update(handle, L, n_rows, ld_U, nullptr, * &n_bytes, CUBLAS_FILL_MODE_UPPER, * stream); - * device_buffer workspace(allocator, stream, n_bytes); + * rmm::device_uvector workspace(n_bytes, stream); * * for (n=1; n<=n_rows; n++) { * // Calculate a new row/column of matrix A into array A_new diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index 6172618380..5b2df3bcb3 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -22,7 +22,8 @@ #include #include #include -#include +#include +#include namespace raft { namespace linalg { @@ -44,7 +45,6 @@ template void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, int n_cols, math_t *eig_vectors, math_t *eig_vals, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; @@ -52,8 +52,8 @@ void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, eig_vals, &lwork)); - raft::mr::device::buffer d_work(allocator, stream, lwork); - raft::mr::device::buffer d_dev_info(allocator, stream, 1); + rmm::device_uvector d_work(lwork, stream); + rmm::device_scalar d_dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); @@ -63,9 +63,7 @@ void eigDC(const raft::handle_t &handle,
const math_t *in, int n_rows, d_dev_info.data(), stream)); CUDA_CHECK(cudaGetLastError()); - int dev_info; - raft::update_host(&dev_info, d_dev_info.data(), 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + int dev_info = d_dev_info.value(stream); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. " "This usually occurs when some of the features do not vary enough."); @@ -93,7 +91,6 @@ template void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, EigVecMemUsage memUsage, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; @@ -104,9 +101,9 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork)); - raft::mr::device::buffer d_work(allocator, stream, lwork); - raft::mr::device::buffer d_dev_info(allocator, stream, 1); - raft::mr::device::buffer d_eig_vectors(allocator, stream, 0); + rmm::device_uvector d_work(lwork, stream); + rmm::device_scalar d_dev_info(stream); + rmm::device_uvector d_eig_vectors(0, stream); if (memUsage == OVERWRITE_INPUT) { CUSOLVER_CHECK(cusolverDnsyevdx( @@ -127,9 +124,7 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, CUDA_CHECK(cudaGetLastError()); - int dev_info; - raft::update_host(&dev_info, d_dev_info.data(), 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + int dev_info = d_dev_info.value(stream); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. " "This usually occurs when some of the features do not vary enough."); @@ -163,7 +158,6 @@ template void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, int n_cols, math_t *eig_vectors, math_t *eig_vals, cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); syevjInfo_t syevj_params = nullptr; @@ -176,8 +170,8 @@ void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, eig_vals, &lwork, syevj_params)); - raft::mr::device::buffer d_work(allocator, stream, lwork); - raft::mr::device::buffer dev_info(allocator, stream, 1); + rmm::device_uvector d_work(lwork, stream); + rmm::device_scalar dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.h index cb2e8ed1ab..9944685a1f 100644 --- a/cpp/include/raft/linalg/init.h +++ b/cpp/include/raft/linalg/init.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace raft { namespace linalg { @@ -40,7 +41,7 @@ void range(T *out, int start, int end, cudaStream_t stream) { thrust::counting_iterator first(start); thrust::counting_iterator last = first + (end - start); thrust::device_ptr ptr(out); - thrust::copy(thrust::cuda::par.on(stream), first, last, ptr); + thrust::copy(rmm::exec_policy(stream), first, last, ptr); } /** diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh index cafa8d54f1..cc912d7d86 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/qr.cuh @@ -19,7 +19,8 @@ #include #include #include -#include +#include +#include namespace raft { 
namespace linalg { @@ -42,7 +43,6 @@ namespace linalg { template void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, int n_rows, int n_cols, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; @@ -50,14 +50,14 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); - raft::mr::device::buffer tau(allocator, stream, k); + rmm::device_uvector tau(k, stream); CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); - raft::mr::device::buffer devInfo(allocator, stream, 1); + rmm::device_scalar devInfo(stream); int Lwork; CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); - raft::mr::device::buffer workspace(allocator, stream, Lwork); + rmm::device_uvector workspace(Lwork, stream); CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); @@ -86,12 +86,11 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, template void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, int n_rows, int n_cols, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; - raft::mr::device::buffer R_full(allocator, stream, m * n); - raft::mr::device::buffer tau(allocator, stream, min(m, n)); + rmm::device_uvector R_full(m * n, stream); + rmm::device_uvector tau(min(m, n), stream); CUDA_CHECK( cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); int R_full_nrows = m, R_full_ncols = n; @@ -99,12 +98,12 @@ void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, cudaMemcpyDeviceToDevice, stream)); int Lwork; - raft::mr::device::buffer devInfo(allocator, stream, 1); + rmm::device_scalar devInfo(stream); CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork)); - raft::mr::device::buffer workspace(allocator, stream, Lwork); + rmm::device_uvector workspace(Lwork, stream); CUSOLVER_CHECK(cusolverDngeqrf( cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh index 7357a68a4c..8b40a80903 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/svd.cuh @@ -23,7 +23,8 @@ #include #include #include -#include +#include +#include #include "eig.cuh" #include "gemm.cuh" #include "transpose.h" @@ -54,8 +55,6 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *sing_vals, T *left_sing_vecs, T *right_sing_vecs, bool trans_right, bool gen_left_vec, bool gen_right_vec, cudaStream_t stream) { - std::shared_ptr allocator = - handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); cublasHandle_t cublasH = handle.get_cublas_handle(); @@ -71,13 +70,13 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, const int m = n_rows; const int n = n_cols; - raft::mr::device::buffer devInfo(allocator, stream, 1); + rmm::device_scalar devInfo(stream); T *d_rwork = nullptr; int lwork = 0; CUSOLVER_CHECK( cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); - raft::mr::device::buffer d_work(allocator, stream, 
lwork); + rmm::device_uvector d_work(lwork, stream); char jobu = 'S'; char jobvt = 'A'; @@ -112,12 +111,11 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, template void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, T *U, T *V, bool gen_left_vec, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); cublasHandle_t cublasH = handle.get_cublas_handle(); int len = n_cols * n_cols; - raft::mr::device::buffer in_cross_mult(allocator, stream, len); + rmm::device_uvector in_cross_mult(len, stream); T alpha = T(1); T beta = T(0); @@ -162,7 +160,6 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, math_t *sing_vals, math_t *left_sing_vecs, math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec, math_t tol, int max_sweeps, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); gesvdjInfo_t gesvdj_params = NULL; @@ -174,7 +171,7 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, int m = n_rows; int n = n_cols; - raft::mr::device::buffer devInfo(allocator, stream, 1); + rmm::device_scalar devInfo(stream); int lwork = 0; int econ = 1; @@ -183,7 +180,7 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params)); - raft::mr::device::buffer d_work(allocator, stream, lwork); + rmm::device_uvector d_work(lwork, stream); CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj( cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, @@ -210,10 +207,8 @@ template void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, math_t *V, math_t *out, int n_rows, int n_cols, int k, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); - const math_t alpha = 1.0, beta = 0.0; - raft::mr::device::buffer SVT(allocator, stream, k * n_cols); + rmm::device_uvector SVT(k * n_cols, stream); raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream); @@ -239,14 +234,13 @@ template bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, math_t *S_vec, math_t *V, int n_rows, int n_cols, int k, math_t tol, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cublasHandle_t cublasH = handle.get_cublas_handle(); int m = n_rows, n = n_cols; // form product matrix - raft::mr::device::buffer P_d(allocator, stream, m * n); - raft::mr::device::buffer S_mat(allocator, stream, k * k); + rmm::device_uvector P_d(m * n, stream); + rmm::device_uvector S_mat(k * k, stream); CUDA_CHECK(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream)); CUDA_CHECK(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream)); @@ -262,7 +256,7 @@ bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, // calculate percent error const math_t alpha = 1.0, beta = -1.0; - raft::mr::device::buffer A_minus_P(allocator, stream, m * n); + rmm::device_uvector A_minus_P(m * n, stream); CUDA_CHECK( cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h index d90f6271fa..db1cabd694 100644 --- a/cpp/include/raft/linalg/transpose.h +++ 
b/cpp/include/raft/linalg/transpose.h @@ -17,8 +17,8 @@ #pragma once #include -#include #include +#include namespace raft { namespace linalg { @@ -60,7 +60,7 @@ void transpose(math_t *inout, int n, cudaStream_t stream) { auto d_inout = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(int idx) { int s_row = idx % m; int s_col = idx / m; diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh index 0a72117140..41ca85dce0 100644 --- a/cpp/include/raft/matrix/math.cuh +++ b/cpp/include/raft/matrix/math.cuh @@ -21,8 +21,8 @@ #include #include #include -#include -#include +#include +#include namespace raft { namespace matrix { @@ -285,7 +285,6 @@ void setValue(math_t *out, const math_t *in, math_t scalar, int len, * @param src: input matrix * @param dest: output matrix. The result is stored in the dest matrix * @param len: number elements of input matrix - * @param allocator device allocator * @param stream cuda stream */ template @@ -294,10 +293,7 @@ void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len, auto d_src = src; auto d_dest = dest; - std::shared_ptr allocator = - handle.get_device_allocator(); - - raft::mr::device::buffer d_sum(allocator, stream, 1); + rmm::device_scalar d_sum(stream); auto *d_sum_ptr = d_sum.data(); auto no_op = [] __device__(math_t in) { return in; }; raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src); diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh index 5f5755e24e..688b92da09 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.cuh @@ -20,13 +20,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include namespace raft { namespace matrix { @@ -64,7 +64,7 @@ void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t size = n_rows_indices * n_cols; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { idx_t row = idx % n_rows_indices; idx_t col = idx / n_rows_indices; @@ -108,7 +108,7 @@ void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows, auto d_q_trunc = out; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { idx_t row = idx % m; idx_t col = idx / m; @@ -133,8 +133,8 @@ void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { auto d_q_reversed = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + (size / 2), [=] __device__(idx_t idx) { + thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2), + [=] __device__(idx_t idx) { idx_t dest_row = idx % m; idx_t dest_col = idx / m; idx_t src_row = dest_row; @@ -161,8 +161,8 @@ void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { auto d_q_reversed = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + (size / 2), [=] __device__(idx_t idx) { + 
thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2), + [=] __device__(idx_t idx) { idx_t dest_row = idx % m; idx_t dest_col = idx / m; idx_t src_row = (m - dest_row) - 1; diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh index 56710ea81f..3d2e44e49b 100644 --- a/cpp/include/raft/random/rng.cuh +++ b/cpp/include/raft/random/rng.cuh @@ -24,9 +24,8 @@ #include #include #include -#include -#include #include +#include #include #include "rng_impl.cuh" @@ -498,7 +497,6 @@ class Rng { * sampling is desired * @param sampledLen output sampled array length * @param len input array length - * @param allocator device allocator for allocating any workspace required * @param stream cuda stream */ template @@ -509,13 +507,10 @@ class Rng { ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); - std::shared_ptr allocator = - handle.get_device_allocator(); - - raft::mr::device::buffer expWts(allocator, stream, len); - raft::mr::device::buffer sortedWts(allocator, stream, len); - raft::mr::device::buffer inIdx(allocator, stream, len); - raft::mr::device::buffer outIdxBuff(allocator, stream, len); + rmm::device_uvector expWts(len, stream); + rmm::device_uvector sortedWts(len, stream); + rmm::device_uvector inIdx(len, stream); + rmm::device_uvector outIdxBuff(len, stream); auto *inIdxPtr = inIdx.data(); // generate modified weights custom_distribution( @@ -533,7 +528,7 @@ class Rng { ///@todo: use a more efficient partitioning scheme instead of full sort // sort the array and pick the top sampledLen items IdxT *outIdxPtr = outIdxBuff.data(); - raft::mr::device::buffer workspace(allocator, stream); + rmm::device_uvector workspace(0, stream); sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream); if (outIdx != nullptr) { diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh index a034bdbda8..79b18ebd0a 100644 --- a/cpp/include/raft/sparse/convert/csr.cuh +++ b/cpp/include/raft/sparse/convert/csr.cuh @@ -22,8 +22,7 @@ #include #include #include -#include -#include +#include #include #include @@ -49,16 +48,15 @@ void coo_to_csr(const raft::handle_t &handle, const int *srcRows, int *dst_offsets, int *dstCols, value_t *dstVals) { auto stream = handle.get_stream(); auto cusparseHandle = handle.get_cusparse_handle(); - auto d_alloc = handle.get_device_allocator(); - raft::mr::device::buffer dstRows(d_alloc, stream, nnz); + rmm::device_uvector dstRows(nnz, stream); CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt( cusparseHandle, m, m, nnz, srcRows, srcCols, stream); - raft::mr::device::buffer pBuffer(d_alloc, stream, buffSize); - raft::mr::device::buffer P(d_alloc, stream, nnz); + rmm::device_uvector pBuffer(buffSize, stream); + rmm::device_uvector P(nnz, stream); CUSPARSE_CHECK( cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(), @@ -147,14 +145,12 @@ void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, * @param nnz: size of COO rows array * @param row_ind: output row indices array * @param m: number of rows in dense matrix - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ 
template void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, - std::shared_ptr d_alloc, cudaStream_t stream) { - raft::mr::device::buffer row_counts(d_alloc, stream, m); + rmm::device_uvector row_counts(m, stream); CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream)); @@ -164,7 +160,7 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, + exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d); } @@ -173,15 +169,11 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, * * @param coo: Input COO matrix * @param row_ind: output row indices array - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(COO *coo, int *row_ind, - std::shared_ptr d_alloc, - cudaStream_t stream) { - sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, - stream); +void sorted_coo_to_csr(COO *coo, int *row_ind, cudaStream_t stream) { + sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream); } }; // end NAMESPACE convert diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh index 73120fea8c..fa21614f8f 100644 --- a/cpp/include/raft/sparse/coo.cuh +++ b/cpp/include/raft/sparse/coo.cuh @@ -17,13 +17,11 @@ #include #include #include -#include -#include +#include #include #include -#include #include #include @@ -58,9 +56,9 @@ namespace sparse { template class COO { protected: - raft::mr::device::buffer rows_arr; - raft::mr::device::buffer cols_arr; - raft::mr::device::buffer vals_arr; + rmm::device_uvector rows_arr; + rmm::device_uvector cols_arr; + rmm::device_uvector vals_arr; public: Index_Type nnz; @@ -68,13 +66,12 @@ class COO { Index_Type n_cols; /** - * @param d_alloc: the device allocator to use for the underlying buffers * @param stream: CUDA stream to use */ - COO(std::shared_ptr d_alloc, cudaStream_t stream) - : rows_arr(d_alloc, stream, 0), - cols_arr(d_alloc, stream, 0), - vals_arr(d_alloc, stream, 0), + COO(cudaStream_t stream) + : rows_arr(0, stream), + cols_arr(0, stream), + vals_arr(0, stream), nnz(0), n_rows(0), n_cols(0) {} @@ -87,10 +84,9 @@ class COO { * @param n_rows: number of rows in the dense matrix * @param n_cols: number of cols in the dense matrix */ - COO(raft::mr::device::buffer &rows, - raft::mr::device::buffer &cols, - raft::mr::device::buffer &vals, Index_Type nnz, Index_Type n_rows = 0, - Index_Type n_cols = 0) + COO(rmm::device_uvector &rows, + rmm::device_uvector &cols, rmm::device_uvector &vals, + Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0) : rows_arr(rows), cols_arr(cols), vals_arr(vals), @@ -99,19 +95,17 @@ class COO { n_cols(n_cols) {} /** - * @param d_alloc: the device allocator use * @param stream: CUDA stream to use * @param nnz: size of the rows/cols/vals arrays * @param n_rows: number of rows in the dense matrix * @param n_cols: number of cols in the dense matrix * @param init: initialize arrays with zeros */ - COO(std::shared_ptr d_alloc, cudaStream_t stream, - Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0, - bool init = true) - : rows_arr(d_alloc, stream, nnz), - cols_arr(d_alloc, stream, nnz), - vals_arr(d_alloc, stream, nnz), + COO(cudaStream_t stream, Index_Type nnz, Index_Type n_rows = 0, + 
Index_Type n_cols = 0, bool init = true) + : rows_arr(nnz, stream), + cols_arr(nnz, stream), + vals_arr(nnz, stream), nnz(nnz), n_rows(n_rows), n_cols(n_cols) { diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh index bc4a68d296..041aedf41c 100644 --- a/cpp/include/raft/sparse/csr.cuh +++ b/cpp/include/raft/sparse/csr.cuh @@ -20,8 +20,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -208,7 +208,6 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, * @param row_ind_ptr the row index pointer of the CSR array * @param nnz the size of row_ind_ptr array * @param N number of vertices - * @param d_alloc: deviceAllocator to use for temp memory * @param stream the cuda stream to use * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) @@ -216,11 +215,8 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, template bool> void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, - std::shared_ptr d_alloc, - cudaStream_t stream, Lambda filter_op) { - raft::mr::device::buffer m(d_alloc, stream, 1); - + Index_ nnz, Index_ N, cudaStream_t stream, Lambda filter_op) { + rmm::device_scalar m(stream); WeakCCState state(m.data()); weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, filter_op); @@ -245,15 +241,12 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, * @param row_ind_ptr the row index pointer of the CSR array * @param nnz the size of row_ind_ptr array * @param N number of vertices - * @param d_alloc: deviceAllocator to use for temp memory * @param stream the cuda stream to use */ template void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, - std::shared_ptr d_alloc, - cudaStream_t stream) { - raft::mr::device::buffer m(d_alloc, stream, 1); + Index_ nnz, Index_ N, cudaStream_t stream) { + rmm::device_scalar m(stream); WeakCCState state(m.data()); weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, [](Index_) { return true; }); diff --git a/cpp/include/raft/sparse/distance/bin_distance.cuh b/cpp/include/raft/sparse/distance/bin_distance.cuh index f3109556b7..6885c250c0 100644 --- a/cpp/include/raft/sparse/distance/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/bin_distance.cuh @@ -21,13 +21,11 @@ #include #include #include -#include - -#include - #include #include +#include #include +#include #include @@ -84,7 +82,6 @@ void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, const value_t *Q_data, value_idx Q_nnz, const value_idx *R_coo_rows, const value_t *R_data, value_idx R_nnz, value_idx m, value_idx n, - std::shared_ptr alloc, cudaStream_t stream, expansion_f expansion_func) { rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); @@ -130,7 +127,7 @@ class jaccard_expanded_distances_t : public distances_t { compute_bin_distance( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), + config_->handle.get_stream(), [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t q_r_union = q_norm + r_norm; value_t denom = q_r_union - dot; @@ -179,7 +176,7 @@ class dice_expanded_distances_t : public distances_t { compute_bin_distance( 
out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), + config_->handle.get_stream(), [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t q_r_union = q_norm + r_norm; value_t dice = (2 * dot) / q_r_union; diff --git a/cpp/include/raft/sparse/distance/coo_spmv.cuh b/cpp/include/raft/sparse/distance/coo_spmv.cuh index 3a78f9ada0..24be171900 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include "../csr.cuh" @@ -35,8 +34,6 @@ #include -#include - namespace raft { namespace sparse { namespace distance { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh index 5ace978a23..3b57225350 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh @@ -22,8 +22,6 @@ #include "coo_mask_row_iterators.cuh" #include -#include -#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh index 44c3833f96..74eb37bc2b 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -20,7 +20,6 @@ #include "../utils.cuh" #include -#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh index 1295d24103..a95c6ff85b 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh @@ -55,7 +55,7 @@ class hash_strategy : public coo_spmv_strategy { rmm::device_uvector &mask_indptr, std::tuple &n_rows_divided, cudaStream_t stream) { - auto policy = rmm::exec_policy(stream); + auto policy = this->config.handle.get_thrust_policy(); auto less = thrust::copy_if( policy, thrust::make_counting_iterator(value_idx(0)), diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh index a1974b3666..03df396b2e 100644 --- a/cpp/include/raft/sparse/distance/distance.cuh +++ b/cpp/include/raft/sparse/distance/distance.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/distance/ip_distance.cuh b/cpp/include/raft/sparse/distance/ip_distance.cuh index 882ccba027..b1e2756671 100644 --- a/cpp/include/raft/sparse/distance/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/ip_distance.cuh @@ -22,8 +22,6 @@ #include #include -#include - #include #include #include @@ -31,6 +29,7 @@ #include #include #include +#include #include diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh index 8886d4c9df..6ccfd4adcb 100644 --- a/cpp/include/raft/sparse/distance/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/l2_distance.cuh @@ -21,17 +21,13 @@ #include #include #include +#include +#include #include #include -#include - -#include - -#include #include - 
-#include #include +#include #include @@ -127,9 +123,8 @@ template alloc, - cudaStream_t stream, expansion_f expansion_func) { + value_idx R_nnz, value_idx m, value_idx n, cudaStream_t stream, + expansion_f expansion_func) { rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); CUDA_CHECK( @@ -161,7 +156,6 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, const value_t *Q_data, value_idx Q_nnz, const value_idx *R_coo_rows, const value_t *R_data, value_idx R_nnz, value_idx m, value_idx n, value_idx n_cols, - std::shared_ptr alloc, cudaStream_t stream) { // sum_sq for std dev rmm::device_uvector Q_sq_norms(m, stream); @@ -221,7 +215,7 @@ class l2_expanded_distances_t : public distances_t { compute_l2( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), + config_->handle.get_stream(), [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { return -2 * dot + q_norm + r_norm; }); @@ -283,7 +277,6 @@ class correlation_expanded_distances_t : public distances_t { compute_corr(out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, config_->b_ncols, - config_->handle.get_device_allocator(), config_->handle.get_stream()); } @@ -322,7 +315,7 @@ class cosine_expanded_distances_t : public distances_t { compute_l2( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), + config_->handle.get_stream(), [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t norms = sqrt(q_norm) * sqrt(r_norm); // deal with potential for 0 in denominator by forcing 0/1 instead diff --git a/cpp/include/raft/sparse/distance/utils.cuh b/cpp/include/raft/sparse/distance/utils.cuh index 6b6d77a2d5..3bee1bc87d 100644 --- a/cpp/include/raft/sparse/distance/utils.cuh +++ b/cpp/include/raft/sparse/distance/utils.cuh @@ -21,8 +21,6 @@ #include -#include - namespace raft { namespace sparse { namespace distance { diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 3cffa1c28a..4ef2ac43e2 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -100,9 +99,8 @@ class UnionFind { template void build_dendrogram_host(const handle_t &handle, const value_idx *rows, const value_idx *cols, const value_t *data, - std::size_t nnz, value_idx *children, - value_t *out_delta, value_idx *out_size) { - auto d_alloc = handle.get_device_allocator(); + size_t nnz, value_idx *children, value_t *out_delta, + value_idx *out_size) { auto stream = handle.get_stream(); value_idx n_edges = nnz; @@ -225,11 +223,10 @@ struct init_label_roots { */ template void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, - const value_idx *children, - std::size_t n_clusters, std::size_t n_leaves) { - auto d_alloc = handle.get_device_allocator(); + const value_idx *children, size_t n_clusters, + size_t n_leaves) { auto stream = handle.get_stream(); - auto thrust_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = 
handle.get_thrust_policy(); // Handle special case where n_clusters == 1 if (n_clusters == 1) { diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index 7cf959dda6..31e4a0f263 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -60,12 +59,11 @@ struct distance_graph_impl &indptr, rmm::device_uvector &indices, rmm::device_uvector &data, int c) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); - auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = handle.get_thrust_policy(); // Need to symmetrize knn into undirected graph - raft::sparse::COO knn_graph_coo(d_alloc, stream); + raft::sparse::COO knn_graph_coo(stream); raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c); @@ -78,7 +76,7 @@ struct distance_graph_impl &tup) { bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); @@ -86,9 +84,8 @@ struct distance_graph_impl(tup)); }); - raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), - knn_graph_coo.nnz, indptr.data(), - m + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, stream); // TODO: Wouldn't need to copy here if we could compute knn // graph directly on the device uvectors diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 765a5ad77f..6ef6f9879b 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -25,12 +25,9 @@ #include #include -#include - #include #include #include -#include namespace raft { namespace hierarchy { @@ -80,18 +77,16 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X, red_op reduction_op, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); - raft::sparse::COO connected_edges(d_alloc, stream); + raft::sparse::COO connected_edges(stream); raft::linkage::connect_components( handle, connected_edges, X, color, m, n, reduction_op); rmm::device_uvector indptr2(m + 1, stream); - raft::sparse::convert::sorted_coo_to_csr(connected_edges.rows(), - connected_edges.nnz, indptr2.data(), - m + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + connected_edges.rows(), connected_edges.nnz, indptr2.data(), m + 1, stream); // On the second call, we hand the MST the original colors // and the new set of edges and let it restart the optimization process @@ -136,7 +131,6 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded, int max_iter = 10) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); // We want to have MST initialize colors on first call. 
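// [Editorial sketch, not part of this patch] A minimal illustration of the calling
// convention the hunks in this file migrate to: temporary device memory is owned by
// rmm::device_uvector and the sparse helpers take only a stream, with no device
// allocator argument. The value type and the variables X, m, n, metric and c below
// are assumptions for illustration only.
raft::sparse::COO<value_t> knn_graph_coo(stream);
raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c);
rmm::device_uvector<int> indptr(m + 1, stream);
raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, stream);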
@@ -145,7 +139,7 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, true); int iters = 1; - int n_components = linkage::get_n_components(color, m, d_alloc, stream); + int n_components = linkage::get_n_components(color, m, stream); while (n_components > 1 && iters < max_iter) { connect_knn_graph(handle, X, mst_coo, m, n, color, @@ -153,7 +147,7 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, iters++; - n_components = linkage::get_n_components(color, m, d_alloc, stream); + n_components = linkage::get_n_components(color, m, stream); } /** diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp index 01a033945c..06fffb8aed 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp @@ -58,7 +58,6 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, "n_clusters must be less than or equal to the number of data points"); auto stream = handle.get_stream(); - auto d_alloc = handle.get_device_allocator(); rmm::device_uvector indptr(EMPTY, stream); rmm::device_uvector indices(EMPTY, stream); diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh index 47b1ba6e41..7ed627b9e2 100644 --- a/cpp/include/raft/sparse/linalg/add.cuh +++ b/cpp/include/raft/sparse/linalg/add.cuh @@ -21,8 +21,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -156,19 +156,17 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, * @param nnz2: size of right hand index_ptr and val arrays * @param m: size of output array (number of rows in final matrix) * @param out_ind: output row_ind array - * @param d_alloc: device allocator to use for temp memory * @param stream: cuda stream to use */ template size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, int nnz1, const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m, int *out_ind, - std::shared_ptr d_alloc, cudaStream_t stream) { dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - raft::mr::device::buffer row_counts(d_alloc, stream, m + 1); + rmm::device_uvector row_counts(m + 1, stream); CUDA_CHECK( cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); @@ -184,7 +182,7 @@ size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, + exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d); return cnnz; diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index 15302f3b74..ce0c4bbe6f 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -18,9 +18,8 @@ #include #include -#include -#include #include +#include #include #include @@ -35,16 +34,15 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, int nnz, int n, int n_components, T *out, unsigned long long seed = 1234567) { auto stream = handle.get_stream(); - auto d_alloc = handle.get_device_allocator(); - raft::mr::device::buffer src_offsets(d_alloc, stream, n + 1); - raft::mr::device::buffer dst_cols(d_alloc, stream, nnz); - raft::mr::device::buffer 
dst_vals(d_alloc, stream, nnz); + rmm::device_uvector src_offsets(n + 1, stream); + rmm::device_uvector dst_cols(nnz, stream); + rmm::device_uvector dst_vals(nnz, stream); convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data()); - raft::mr::device::buffer eigVals(d_alloc, stream, n_components + 1); - raft::mr::device::buffer eigVecs(d_alloc, stream, n * (n_components + 1)); - raft::mr::device::buffer labels(d_alloc, stream, n); + rmm::device_uvector eigVals(n_components + 1, stream); + rmm::device_uvector eigVecs(n * (n_components + 1), stream); + rmm::device_uvector labels(n, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -65,8 +63,6 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, index_type maxiter = 4000; //default reset value (when set to 0); value_type tol = 0.01; index_type restart_iter = 15 + neigvs; //what cugraph is using - auto t_exe_p = thrust::cuda::par.on(stream); - using thrust_exe_policy_t = decltype(t_exe_p); raft::eigen_solver_config_t cfg{neigvs, maxiter, restart_iter, tol}; @@ -85,15 +81,14 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, using value_type_t = value_type; std::pair solve( - handle_t const &handle, thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, size_type_t dim, + handle_t const &handle, size_type_t n_obs_vecs, size_type_t dim, value_type_t const *__restrict__ obs, index_type_t *__restrict__ codes) const { return std::make_pair(0, 0); } }; - raft::spectral::partition(handle, t_exe_p, r_csr_m, eig_solver, + raft::spectral::partition(handle, r_csr_m, eig_solver, no_op_cluster_solver_t{}, labels.data(), eigVals.data(), eigVecs.data()); diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index 5c2c78f0c3..a6e1027288 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -21,8 +21,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -31,8 +31,6 @@ #include #include -#include -#include #include #include @@ -122,22 +120,20 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, * @param in: Input COO matrix * @param out: Output symmetrized COO matrix * @param reduction_op: a custom reduction function - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template void coo_symmetrize(COO *in, COO *out, Lambda reduction_op, // two-argument reducer - std::shared_ptr d_alloc, cudaStream_t stream) { dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); ASSERT(!out->validate_mem(), "Expecting unallocated COO for output"); - raft::mr::device::buffer in_row_ind(d_alloc, stream, in->n_rows); + rmm::device_uvector in_row_ind(in->n_rows, stream); - convert::sorted_coo_to_csr(in, in_row_ind.data(), d_alloc, stream); + convert::sorted_coo_to_csr(in, in_row_ind.data(), stream); out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream); @@ -250,14 +246,14 @@ __global__ static void symmetric_sum(value_idx *restrict edges, * @param k: Number of n_neighbors * @param out: Output COO Matrix class * @param stream: Input cuda stream - * @param d_alloc device allocator for temporary buffers */ template -void from_knn_symmetrize_matrix( - const value_idx *restrict knn_indices, const value_t *restrict knn_dists, - const value_idx n, const int k, COO *out, - cudaStream_t stream, std::shared_ptr d_alloc) { +void 
from_knn_symmetrize_matrix(const value_idx *restrict knn_indices, + const value_t *restrict knn_dists, + const value_idx n, const int k, + COO *out, + cudaStream_t stream) { // (1) Find how much space needed in each row // We look through all datapoints and increment the count for each row. const dim3 threadsPerBlock(TPB_X, TPB_Y); @@ -265,11 +261,11 @@ void from_knn_symmetrize_matrix( raft::ceildiv(k, TPB_Y)); // Notice n+1 since we can reuse these arrays for transpose_edges, original_edges in step (4) - raft::mr::device::buffer row_sizes(d_alloc, stream, n); + rmm::device_uvector row_sizes(n, stream); CUDA_CHECK( cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); - raft::mr::device::buffer row_sizes2(d_alloc, stream, n); + rmm::device_uvector row_sizes2(n, stream); CUDA_CHECK( cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); @@ -298,8 +294,8 @@ void from_knn_symmetrize_matrix( thrust::device_pointer_cast(row_sizes.data()); // Rolling cumulative sum - thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes, - __row_sizes + n, __edges); + thrust::exclusive_scan(rmm::exec_policy(stream), __row_sizes, __row_sizes + n, + __edges); // (5) Perform final data + data.T operation in tandem with memcpying symmetric_sum<<>>( @@ -314,7 +310,6 @@ template void symmetrize(const raft::handle_t &handle, const value_idx *rows, const value_idx *cols, const value_t *vals, size_t m, size_t n, size_t nnz, raft::sparse::COO &out) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); // copy rows to cols and cols to rows @@ -333,7 +328,7 @@ void symmetrize(const raft::handle_t &handle, const value_idx *rows, // sort COO raft::sparse::op::coo_sort((value_idx)m, (value_idx)n, (value_idx)nnz * 2, symm_rows.data(), symm_cols.data(), - symm_vals.data(), d_alloc, stream); + symm_vals.data(), stream); raft::sparse::op::max_duplicates(handle, out, symm_rows.data(), symm_cols.data(), symm_vals.data(), nnz * 2, diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h index 6afe4ca8f6..7ad4b93ec0 100644 --- a/cpp/include/raft/sparse/linalg/transpose.h +++ b/cpp/include/raft/sparse/linalg/transpose.h @@ -21,8 +21,7 @@ #include #include #include -#include -#include +#include #include #include @@ -53,7 +52,6 @@ namespace linalg { * @param[in] csr_nrows : Number of rows in CSR * @param[in] csr_ncols : Number of columns in CSR * @param[in] nnz : Number of nonzeros of CSR - * @param[in] allocator : Allocator for intermediate memory * @param[in] stream : Cuda stream for ordering events */ template @@ -61,9 +59,7 @@ void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr, const value_idx *csr_indices, const value_t *csr_data, value_idx *csc_indptr, value_idx *csc_indices, value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols, - value_idx nnz, - std::shared_ptr allocator, - cudaStream_t stream) { + value_idx nnz, cudaStream_t stream) { size_t convert_csc_workspace_size = 0; CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize( @@ -72,8 +68,8 @@ void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr, CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, &convert_csc_workspace_size, stream)); - raft::mr::device::buffer convert_csc_workspace( - allocator, stream, convert_csc_workspace_size); + rmm::device_uvector convert_csc_workspace(convert_csc_workspace_size, + stream); CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc( handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, 
csr_indices, diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh index c5ba4fcb4f..33b980afcd 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh @@ -23,11 +23,10 @@ #include "utils.cuh" #include -#include -#include +#include +#include #include -#include #include #include #include @@ -35,11 +34,6 @@ #include #include -#include - -#include -#include - namespace raft { namespace mst { typedef std::chrono::high_resolution_clock Clock; @@ -65,20 +59,20 @@ MST_solver::MST_solver( offsets(offsets_), indices(indices_), weights(weights_), - altered_weights(e_), + altered_weights(e_, stream_), v(v_), e(e_), color_index(color_), - color(v_), - next_color(v_), - min_edge_color(v_), - new_mst_edge(v_), - mst_edge(e_, false), - temp_src(2 * v_), - temp_dst(2 * v_), - temp_weights(2 * v_), - mst_edge_count(1, 0), - prev_mst_edge_count(1, 0), + color(v_, stream_), + next_color(v_, stream_), + min_edge_color(v_, stream_), + new_mst_edge(v_, stream_), + mst_edge(e_, stream_), + temp_src(2 * v_, stream_), + temp_dst(2 * v_, stream_), + temp_weights(2 * v_, stream_), + mst_edge_count(1, stream_), + prev_mst_edge_count(1, stream_), stream(stream_), symmetrize_output(symmetrize_output_), initialize_colors(initialize_colors_), @@ -87,13 +81,18 @@ MST_solver::MST_solver( max_threads = handle_.get_device_properties().maxThreadsPerBlock; sm_count = handle_.get_device_properties().multiProcessorCount; + mst_edge_count.set_value_to_zero_async(stream); + prev_mst_edge_count.set_value_to_zero_async(stream); + CUDA_CHECK(cudaMemsetAsync(mst_edge.data(), 0, mst_edge.size() * sizeof(bool), + stream)); + //Initially, color holds the vertex id as color - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); if (initialize_colors_) { thrust::sequence(policy, color.begin(), color.end(), 0); thrust::sequence(policy, color_index, color_index + v, 0); } else { - raft::copy(color.data().get(), color_index, v, stream); + raft::copy(color.data(), color_index, v, stream); } thrust::sequence(policy, next_color.begin(), next_color.end(), 0); } @@ -160,12 +159,12 @@ MST_solver::solve() { timer3 += duration_us(stop - start); #endif - auto curr_mst_edge_count = mst_edge_count[0]; + auto curr_mst_edge_count = mst_edge_count.value(stream); RAFT_EXPECTS(curr_mst_edge_count <= max_mst_edges, "Number of edges found by MST is invalid. This may be due to " "loss in precision. 
Try increasing precision of weights."); - if (curr_mst_edge_count == prev_mst_edge_count[0]) { + if (curr_mst_edge_count == prev_mst_edge_count.value(stream)) { #ifdef MST_TIME std::cout << "Iterations: " << i << std::endl; std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 @@ -196,12 +195,11 @@ MST_solver::solve() { #endif // copy this iteration's results and store - prev_mst_edge_count = mst_edge_count; + prev_mst_edge_count.set_value_async(curr_mst_edge_count, stream); } // result packaging - thrust::host_vector host_mst_edge_count = mst_edge_count; - mst_result.n_edges = host_mst_edge_count[0]; + mst_result.n_edges = mst_edge_count.value(stream); mst_result.src.resize(mst_result.n_edges, stream); mst_result.dst.resize(mst_result.n_edges, stream); mst_result.weights.resize(mst_result.n_edges, stream); @@ -227,8 +225,8 @@ template alteration_t MST_solver::alteration_max() { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); - rmm::device_vector tmp(e); + auto policy = handle.get_thrust_policy(); + rmm::device_uvector tmp(e, stream); thrust::device_ptr weights_ptr(weights); thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin()); //sort tmp weights @@ -242,7 +240,7 @@ MST_solver::alteration_max() { thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); auto end = thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); - auto init = tmp[1] - tmp[0]; + auto init = tmp.element(1, stream) - tmp.element(0, stream); auto max = thrust::transform_reduce(policy, begin, end, alteration_functor(), init, thrust::minimum()); @@ -261,7 +259,7 @@ void MST_solver::alteration() { alteration_t max = alteration_max(); // pool of rand values - rmm::device_vector rand_values(v); + rmm::device_uvector rand_values(v, stream); // Random number generator curandGenerator_t randGen; @@ -269,8 +267,7 @@ void MST_solver::alteration() { curandSetPseudoRandomGeneratorSeed(randGen, 1234567); // Initialize rand values - auto curand_status = - curand_generate_uniformX(randGen, rand_values.data().get(), v); + auto curand_status = curand_generate_uniformX(randGen, rand_values.data(), v); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND failed"); curand_status = curandDestroyGenerator(randGen); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, @@ -278,8 +275,8 @@ void MST_solver::alteration() { //Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu detail::alteration_kernel<<>>( - v, e, offsets, indices, weights, max, rand_values.data().get(), - altered_weights.data().get()); + v, e, offsets, indices, weights, max, rand_values.data(), + altered_weights.data()); } // updates colors of vertices by propagating the lower color to the higher @@ -288,23 +285,24 @@ template ::label_prop( vertex_t* mst_src, vertex_t* mst_dst) { // update the colors of both ends its until there is no change in colors - thrust::host_vector curr_mst_edge_count = mst_edge_count; + edge_t curr_mst_edge_count = mst_edge_count.value(stream); auto min_pair_nthreads = std::min(v, (vertex_t)max_threads); auto min_pair_nblocks = std::min( (v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); - rmm::device_vector done(1, false); - - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - vertex_t* color_ptr = color.data().get(); - vertex_t* next_color_ptr = next_color.data().get(); + edge_t* new_mst_edge_ptr = new_mst_edge.data(); + vertex_t* color_ptr = color.data(); + vertex_t* next_color_ptr = next_color.data(); 
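// [editor's note] Illustrative sketch only, not part of the patch above/below. The
// surrounding hunks replace one-element rmm::device_vector counters (previously read
// back on the host via operator[] or a thrust::host_vector copy) with
// rmm::device_scalar, whose accessors are explicitly stream-ordered. The kernel and
// function names below are hypothetical; only the rmm::device_scalar calls shown here
// are taken from the patch itself.

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>

__global__ void count_kernel(int* count) { atomicAdd(count, 1); }

void device_scalar_pattern(rmm::cuda_stream_view stream) {
  rmm::device_scalar<int> edge_count(stream);     // device-side counter, uninitialized
  edge_count.set_value_to_zero_async(stream);     // async memset, ordered on `stream`

  // kernels update the counter directly through its device pointer
  count_kernel<<<1, 32, 0, stream.value()>>>(edge_count.data());

  int host_count = edge_count.value(stream);      // syncs `stream`, copies value to host
  edge_count.set_value_async(host_count, stream); // writes a host lvalue back, async
}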
- bool* done_ptr = done.data().get(); + rmm::device_scalar done(stream); + done.set_value_to_zero_async(stream); + bool* done_ptr = done.data(); + const bool true_val = true; auto i = 0; - while (!done[0]) { - done[0] = true; + while (!done.value(stream)) { + done.set_value_async(true_val, stream); detail::min_pair_colors<<>>( v, indices, new_mst_edge_ptr, color_ptr, color_index, next_color_ptr); @@ -327,7 +325,7 @@ template void MST_solver::min_edge_per_vertex() { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); thrust::fill(policy, min_edge_color.begin(), min_edge_color.end(), std::numeric_limits::max()); thrust::fill(policy, new_mst_edge.begin(), new_mst_edge.end(), @@ -335,11 +333,11 @@ void MST_solver>>( offsets, indices, altered_weights_ptr, color_ptr, color_index, @@ -354,18 +352,18 @@ void MST_solver::max()); - vertex_t* color_ptr = color.data().get(); - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - bool* mst_edge_ptr = mst_edge.data().get(); - alteration_t* min_edge_color_ptr = min_edge_color.data().get(); - alteration_t* altered_weights_ptr = altered_weights.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); - vertex_t* temp_dst_ptr = temp_dst.data().get(); - weight_t* temp_weights_ptr = temp_weights.data().get(); + vertex_t* color_ptr = color.data(); + edge_t* new_mst_edge_ptr = new_mst_edge.data(); + bool* mst_edge_ptr = mst_edge.data(); + alteration_t* min_edge_color_ptr = min_edge_color.data(); + alteration_t* altered_weights_ptr = altered_weights.data(); + vertex_t* temp_src_ptr = temp_src.data(); + vertex_t* temp_dst_ptr = temp_dst.data(); + weight_t* temp_weights_ptr = temp_weights.data(); detail::min_edge_per_supervertex<<>>( color_ptr, color_index, new_mst_edge_ptr, mst_edge_ptr, indices, weights, @@ -390,8 +388,8 @@ void MST_solver::check_termination() { std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); // count number of new mst edges - edge_t* mst_edge_count_ptr = mst_edge_count.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); + edge_t* mst_edge_count_ptr = mst_edge_count.data(); + vertex_t* temp_src_ptr = temp_src.data(); detail::kernel_count_new_mst_edges<<>>( temp_src_ptr, mst_edge_count_ptr, 2 * v); @@ -411,9 +409,9 @@ template void MST_solver::append_src_dst_pair( vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); - auto curr_mst_edge_count = prev_mst_edge_count[0]; + edge_t curr_mst_edge_count = prev_mst_edge_count.value(stream); // iterator to end of mst edges added to final output in previous iteration auto src_dst_zip_end = thrust::make_zip_iterator(thrust::make_tuple( diff --git a/cpp/include/raft/sparse/mst/detail/utils.cuh b/cpp/include/raft/sparse/mst/detail/utils.cuh index 8f755de459..4d5ca6ebe1 100644 --- a/cpp/include/raft/sparse/mst/detail/utils.cuh +++ b/cpp/include/raft/sparse/mst/detail/utils.cuh @@ -18,7 +18,7 @@ #pragma once #include -#include +#include #define MST_TIME namespace raft { @@ -32,7 +32,7 @@ __device__ idx_t get_1D_idx() { // somewhat smart vector print template -void printv(rmm::device_vector& vec, const std::string& name = "", +void printv(rmm::device_uvector& vec, const std::string& name = "", const size_t displ = 5) { #ifdef MST_TIME std::cout.precision(15); diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh b/cpp/include/raft/sparse/mst/mst_solver.cuh index 833882ea0d..44b34ee5c7 100644 --- 
a/cpp/include/raft/sparse/mst/mst_solver.cuh +++ b/cpp/include/raft/sparse/mst/mst_solver.cuh @@ -18,8 +18,8 @@ #pragma once #include +#include #include -#include namespace raft { @@ -68,24 +68,24 @@ class MST_solver { vertex_t sm_count; vertex_t* color_index; // represent each supervertex as a color - rmm::device_vector + rmm::device_uvector min_edge_color; // minimum incident edge weight per color - rmm::device_vector new_mst_edge; // new minimum edge per vertex - rmm::device_vector + rmm::device_uvector new_mst_edge; // new minimum edge per vertex + rmm::device_uvector altered_weights; // weights to be used for mst - rmm::device_vector + rmm::device_scalar mst_edge_count; // total number of edges added after every iteration - rmm::device_vector + rmm::device_scalar prev_mst_edge_count; // total number of edges up to the previous iteration - rmm::device_vector + rmm::device_uvector mst_edge; // mst output - true if the edge belongs in mst - rmm::device_vector next_color; // next iteration color - rmm::device_vector color; // index of color that vertex points to + rmm::device_uvector next_color; // next iteration color + rmm::device_uvector color; // index of color that vertex points to // new src-dst pairs found per iteration - rmm::device_vector temp_src; - rmm::device_vector temp_dst; - rmm::device_vector temp_weights; + rmm::device_uvector temp_src; + rmm::device_uvector temp_dst; + rmm::device_uvector temp_weights; void label_prop(vertex_t* mst_src, vertex_t* mst_dst); void min_edge_per_vertex(); diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh index 562d506cfe..492058f85f 100644 --- a/cpp/include/raft/sparse/op/filter.cuh +++ b/cpp/include/raft/sparse/op/filter.cuh @@ -21,8 +21,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -84,11 +84,9 @@ __global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, template void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, int *crows, int *ccols, T *cvals, int *cnnz, - int *cur_cnnz, T scalar, int n, - std::shared_ptr d_alloc, - cudaStream_t stream) { - raft::mr::device::buffer ex_scan(d_alloc, stream, n); - raft::mr::device::buffer cur_ex_scan(d_alloc, stream, n); + int *cur_cnnz, T scalar, int n, cudaStream_t stream) { + rmm::device_uvector ex_scan(n, stream); + rmm::device_uvector cur_ex_scan(n, stream); CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream)); @@ -96,14 +94,14 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); thrust::device_ptr dev_ex_scan = thrust::device_pointer_cast(ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, + thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); thrust::device_ptr dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cur_cnnz, + thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); @@ -122,15 +120,12 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, * @param in: input COO matrix * @param out: output COO matrix * 
@param scalar: scalar to remove from arrays - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template -void coo_remove_scalar(COO *in, COO *out, T scalar, - std::shared_ptr d_alloc, - cudaStream_t stream) { - raft::mr::device::buffer row_count_nz(d_alloc, stream, in->n_rows); - raft::mr::device::buffer row_count(d_alloc, stream, in->n_rows); +void coo_remove_scalar(COO *in, COO *out, T scalar, cudaStream_t stream) { + rmm::device_uvector row_count_nz(in->n_rows, stream); + rmm::device_uvector row_count(in->n_rows, stream); CUDA_CHECK( cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); @@ -146,7 +141,7 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, thrust::device_ptr d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data()); - int out_nnz = thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, + int out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows); out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream); @@ -154,7 +149,7 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, coo_remove_scalar(in->rows(), in->cols(), in->vals(), in->nnz, out->rows(), out->cols(), out->vals(), row_count_nz.data(), row_count.data(), scalar, - in->n_rows, d_alloc, stream); + in->n_rows, stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -163,14 +158,11 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, * * @param in: input COO matrix * @param out: output COO matrix - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template -void coo_remove_zeros(COO *in, COO *out, - std::shared_ptr d_alloc, - cudaStream_t stream) { - coo_remove_scalar(in, out, T(0.0), d_alloc, stream); +void coo_remove_zeros(COO *in, COO *out, cudaStream_t stream) { + coo_remove_scalar(in, out, T(0.0), stream); } }; // namespace op diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh index 53c9f89074..09a35720fb 100644 --- a/cpp/include/raft/sparse/op/reduce.cuh +++ b/cpp/include/raft/sparse/op/reduce.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -32,7 +31,6 @@ #include #include #include -#include #include #include @@ -126,18 +124,16 @@ void max_duplicates(const raft::handle_t &handle, raft::sparse::COO &out, const value_idx *rows, const value_idx *cols, const value_t *vals, size_t nnz, size_t m, size_t n) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); - - auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = handle.get_thrust_policy(); // compute diffs & take exclusive scan rmm::device_uvector diff(nnz + 1, stream); compute_duplicates_mask(diff.data(), rows, cols, nnz, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), diff.data(), - diff.data() + diff.size(), diff.data()); + thrust::exclusive_scan(thrust_policy, diff.data(), diff.data() + diff.size(), + diff.data()); // compute final size value_idx size = 0; diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h index 09d5b568be..c40801a0b1 100644 --- a/cpp/include/raft/sparse/op/sort.h +++ b/cpp/include/raft/sparse/op/sort.h @@ -20,9 +20,8 @@ #include #include #include -#include -#include #include +#include #include #include @@ -59,35 +58,28 @@ struct TupleComp { * @param rows rows array from coo matrix * @param cols cols array from coo matrix * @param vals vals array from coo matrix - * @param d_alloc device 
allocator for temporary buffers * @param stream: cuda stream to use */ template void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, - // TODO: Remove this - std::shared_ptr d_alloc, cudaStream_t stream) { auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(thrust::cuda::par.on(stream), coo_indices, - coo_indices + nnz, vals, TupleComp()); + thrust::sort_by_key(rmm::exec_policy(stream), coo_indices, coo_indices + nnz, + vals, TupleComp()); } /** * @brief Sort the underlying COO arrays by row * @tparam T: the type name of the underlying value array * @param in: COO to sort by row - * @param d_alloc device allocator for temporary buffers * @param stream: the cuda stream to use */ template -void coo_sort(COO *const in, - // TODO: Remove this - std::shared_ptr d_alloc, - cudaStream_t stream) { +void coo_sort(COO *const in, cudaStream_t stream) { coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), - in->vals(), d_alloc, stream); + in->vals(), stream); } /** @@ -107,8 +99,7 @@ void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data, auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); - thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz, - first); + thrust::sort_by_key(rmm::exec_policy(stream), t_data, t_data + nnz, first); } }; // namespace op }; // end NAMESPACE sparse diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index 8aae90f1d8..46369ca964 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -159,14 +159,10 @@ struct CubKVPMinReduce { */ template value_idx get_n_components(value_idx *colors, size_t n_rows, - std::shared_ptr d_alloc, cudaStream_t stream) { - value_idx *map_ids; - int num_clusters; - raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream, - d_alloc); - d_alloc->deallocate(map_ids, num_clusters * sizeof(value_idx), stream); - + rmm::device_uvector map_ids(0, stream); + int num_clusters = + raft::label::getUniquelabels(map_ids, colors, n_rows, stream); return num_clusters; } @@ -197,15 +193,13 @@ struct LookupColorOp { * @param[in] X original dense data * @param[in] n_rows number of rows in original dense data * @param[in] n_cols number of columns in original dense data - * @param[in] d_alloc device allocator to use * @param[in] stream cuda stream for which to order cuda operations */ template void perform_1nn(cub::KeyValuePair *kvp, value_idx *nn_colors, value_idx *colors, const value_t *X, - size_t n_rows, size_t n_cols, - std::shared_ptr d_alloc, - cudaStream_t stream, red_op reduction_op) { + size_t n_rows, size_t n_cols, cudaStream_t stream, + red_op reduction_op) { rmm::device_uvector workspace(n_rows, stream); rmm::device_uvector x_norm(n_rows, stream); @@ -218,7 +212,7 @@ void perform_1nn(cub::KeyValuePair *kvp, workspace.data(), reduction_op, reduction_op, true, true, stream); LookupColorOp extract_colors_op(colors); - thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors, + thrust::transform(rmm::exec_policy(stream), kvp, kvp + n_rows, nn_colors, extract_colors_op); } @@ -239,15 +233,15 @@ void sort_by_color(value_idx *colors, value_idx *nn_colors, cub::KeyValuePair *kvp, value_idx *src_indices, size_t n_rows, cudaStream_t stream) { thrust::counting_iterator 
arg_sort_iter(0); - thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter, - arg_sort_iter + n_rows, src_indices); + thrust::copy(rmm::exec_policy(stream), arg_sort_iter, arg_sort_iter + n_rows, + src_indices); auto keys = thrust::make_zip_iterator(thrust::make_tuple( colors, nn_colors, (raft::linkage::KeyValuePair *)kvp)); auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals, + thrust::sort_by_key(rmm::exec_policy(stream), keys, keys + n_rows, vals, TupleComp()); } @@ -324,7 +318,6 @@ void connect_components(const raft::handle_t &handle, size_t n_rows, size_t n_cols, red_op reduction_op, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, @@ -336,10 +329,9 @@ void connect_components(const raft::handle_t &handle, // Normalize colors so they are drawn from a monotonically increasing set raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, - d_alloc, true); + true); - value_idx n_components = - get_n_components(colors.data(), n_rows, d_alloc, stream); + value_idx n_components = get_n_components(colors.data(), n_rows, stream); /** * First compute 1-nn for all colors where the color of each data point @@ -351,7 +343,7 @@ void connect_components(const raft::handle_t &handle, rmm::device_uvector src_indices(n_rows, stream); perform_1nn(temp_inds_dists.data(), nn_colors.data(), colors.data(), X, - n_rows, n_cols, d_alloc, stream, reduction_op); + n_rows, n_cols, stream, reduction_op); /** * Sort data points by color (neighbors are not sorted) @@ -369,7 +361,7 @@ void connect_components(const raft::handle_t &handle, raft::sparse::op::compute_duplicates_mask(out_index.data(), colors.data(), nn_colors.data(), n_rows, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), out_index.data(), + thrust::exclusive_scan(handle.get_thrust_policy(), out_index.data(), out_index.data() + out_index.size(), out_index.data()); // compute final size @@ -380,7 +372,7 @@ void connect_components(const raft::handle_t &handle, size++; - raft::sparse::COO min_edges(d_alloc, stream); + raft::sparse::COO min_edges(stream); min_edges.allocate(size, n_rows, n_rows, true, stream); min_components_by_color(min_edges, out_index.data(), src_indices.data(), diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index 71fbb8ab3d..3566939bc4 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -415,7 +414,6 @@ class sparse_knn_t { * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) * @param[in] k the number of neighbors to query * @param[in] cusparseHandle the initialized cusparseHandle instance to use - * @param[in] allocator device allocator instance to use * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to * @param[in] batch_size_index maximum number of rows to use from index matrix per batch * @param[in] batch_size_query maximum number of rows to use from query matrix per batch diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh index 
1cf225087a..1cdd66f516 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/knn_graph.cuh @@ -96,7 +96,6 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, raft::sparse::COO &out, int c = 15) { int k = build_k(m, c); - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); size_t nnz = m * k; diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index 77d7831b4a..2cdf9bf4f5 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -22,15 +22,12 @@ #include #include -#include #include namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; - /** * @brief Flat C++ API function to build an approximate nearest neighbors index * from an index array and a set of parameters. diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 6e4c99b646..77ad4afe96 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -39,14 +39,11 @@ #include #include -#include #include #include -#include #include -#include #include #include @@ -145,8 +142,7 @@ void approx_knn_build_index(raft::handle_t &handle, // perform preprocessing // k set to 0 (unused during preprocessing / revertion) std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, 0, false, handle.get_stream(), - handle.get_device_allocator()); + create_processor(metric, n, D, 0, false, handle.get_stream()); query_metric_processor->preprocess(index_array); @@ -183,7 +179,7 @@ void approx_knn_search(raft::handle_t &handle, float *distances, // perform preprocessing std::unique_ptr> query_metric_processor = create_processor(index->metric, n, index->index->d, k, false, - handle.get_stream(), handle.get_device_allocator()); + handle.get_stream()); query_metric_processor->preprocess(query_array); index->index->search(n, query_array, k, distances, indices); diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 09494e9eb1..84c130b0e4 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -19,6 +19,8 @@ #include #include +#include + #include #include #include @@ -27,7 +29,6 @@ #include #include -#include #include #include #include @@ -179,7 +180,6 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, * @param[out] res_I pointer to device memory for returning k nearest indices * @param[out] res_D pointer to device memory for returning k nearest distances * @param[in] k number of neighbors to query - * @param[in] allocator the device memory allocator to use for temporary scratch memory * @param[in] userStream the main cuda stream to use * @param[in] internalStreams optional when n_params > 0, the index partitions can be * queried in parallel using these streams. 
Note that n_int_streams also @@ -198,7 +198,6 @@ template void brute_force_knn_impl(std::vector &input, std::vector &sizes, IntType D, float *search_items, IntType n, int64_t *res_I, float *res_D, IntType k, - std::shared_ptr allocator, cudaStream_t userStream, cudaStream_t *internalStreams = nullptr, int n_int_streams = 0, bool rowMajorIndex = true, @@ -228,28 +227,26 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, // perform preprocessing std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, k, rowMajorQuery, userStream, - allocator); + create_processor(metric, n, D, k, rowMajorQuery, userStream); query_metric_processor->preprocess(search_items); std::vector>> metric_processors( input.size()); for (size_t i = 0; i < input.size(); i++) { - metric_processors[i] = create_processor( - metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); + metric_processors[i] = create_processor(metric, sizes[i], D, k, + rowMajorQuery, userStream); metric_processors[i]->preprocess(input[i]); } int device; CUDA_CHECK(cudaGetDevice(&device)); - raft::mr::device::buffer trans(allocator, userStream, - id_ranges->size()); + rmm::device_uvector trans(id_ranges->size(), userStream); raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream); - raft::mr::device::buffer all_D(allocator, userStream, 0); - raft::mr::device::buffer all_I(allocator, userStream, 0); + rmm::device_uvector all_D(0, userStream); + rmm::device_uvector all_I(0, userStream); float *out_D = res_D; int64_t *out_I = res_I; diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index a645412c2f..876e91e877 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -19,16 +19,14 @@ #include #include #include -#include -#include #include #include +#include namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; /** * @brief A virtual class defining pre- and post-processing * for metrics. 
This class will temporarily modify its given @@ -56,16 +54,13 @@ class CosineMetricProcessor : public MetricProcessor { size_t n_rows_; size_t n_cols_; cudaStream_t stream_; - std::shared_ptr device_allocator_; - raft::mr::device::buffer colsums_; + rmm::device_uvector colsums_; public: CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, - cudaStream_t stream, - std::shared_ptr allocator) - : device_allocator_(allocator), - stream_(stream), - colsums_(allocator, stream, n_rows), + cudaStream_t stream) + : stream_(stream), + colsums_(n_rows, stream), n_cols_(n_cols), n_rows_(n_rows), row_major_(row_major), @@ -104,11 +99,9 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { public: CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k, - bool row_major, cudaStream_t stream, - std::shared_ptr allocator) - : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream, - allocator), - means_(allocator, stream, n_rows) {} + bool row_major, cudaStream_t stream) + : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream), + means_(n_rows, stream) {} void preprocess(math_t *data) { math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_; @@ -143,7 +136,7 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { ~CorrelationMetricProcessor() = default; - raft::mr::device::buffer means_; + rmm::device_uvector means_; }; template @@ -161,18 +154,18 @@ class DefaultMetricProcessor : public MetricProcessor { template inline std::unique_ptr> create_processor( distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, - cudaStream_t userStream, std::shared_ptr allocator) { + cudaStream_t userStream) { MetricProcessor *mp = nullptr; switch (metric) { case distance::DistanceType::CosineExpanded: - mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, - allocator); + mp = + new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream); break; case distance::DistanceType::CorrelationExpanded: mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, - userStream, allocator); + userStream); break; default: mp = new DefaultMetricProcessor(); diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index a3a1972c13..71c547c281 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -18,15 +18,12 @@ #include "detail/knn_brute_force_faiss.cuh" -#include #include namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; - template inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, value_idx *outV, size_t n_samples, int n_parts, @@ -72,8 +69,7 @@ inline void brute_force_knn( std::vector int_streams = handle.get_internal_streams(); detail::brute_force_knn_impl(input, sizes, D, search_items, n, res_I, res_D, - k, handle.get_device_allocator(), - handle.get_stream(), int_streams.data(), + k, handle.get_stream(), int_streams.data(), handle.get_num_internal_streams(), rowMajorIndex, rowMajorQuery, translations, metric, metric_arg); } diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 922ae7cfab..6f507331d9 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -42,19 +42,16 @@ struct kmeans_solver_t { size_type_t> const& config) : config_(config) {} - template std::pair solve( - handle_t const& handle, thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, 
size_type_t dim, + handle_t const& handle, size_type_t n_obs_vecs, size_type_t dim, value_type_t const* __restrict__ obs, index_type_t* __restrict__ codes) const { RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; - kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, - config_.tol, config_.maxIter, obs, codes, residual, iters, - config_.seed); + kmeans(handle, n_obs_vecs, dim, config_.n_clusters, config_.tol, + config_.maxIter, obs, codes, residual, iters, config_.seed); return std::make_pair(residual, iters); } diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index fb05bff3e2..b6f0105487 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -325,7 +324,6 @@ static __global__ void divideCentroids( * Centroid is randomly chosen with k-means++ algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param n Number of observation vectors. * @param d Dimension of observation vectors. @@ -341,12 +339,9 @@ static __global__ void divideCentroids( * coordinates. * @return Zero if successful. Otherwise non-zero. */ -template -static int chooseNewCentroid(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, - value_type_t rand, +template +static int chooseNewCentroid(handle_t const& handle, index_type_t n, + index_type_t d, index_type_t k, value_type_t rand, const value_type_t* __restrict__ obs, value_type_t* __restrict__ dists, value_type_t* __restrict__ centroid) { @@ -357,8 +352,9 @@ static int chooseNewCentroid(handle_t const& handle, // Observation vector that is chosen as new centroid index_type_t obsIndex; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Compute cumulative sum of distances thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), @@ -417,10 +413,7 @@ static int chooseNewCentroid(handle_t const& handle, * Centroids are randomly chosen with k-means++ algorithm * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -439,14 +432,12 @@ static int chooseNewCentroid(handle_t const& handle, * distance between observation vectors and the closest centroid. * @return Zero if successful. Otherwise non-zero. 
*/ -template +template static int initializeCentroids( - handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, - value_type_t* __restrict__ centroids, index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ dists, - unsigned long long seed) { + handle_t const& handle, index_type_t n, index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, value_type_t* __restrict__ centroids, + index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ dists, unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -458,8 +449,9 @@ static int initializeCentroids( thrust::default_random_engine rng(seed); thrust::uniform_real_distribution uniformDist(0, 1); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); constexpr index_type_t grid_lower_bound{65535}; @@ -486,8 +478,8 @@ static int initializeCentroids( thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); CHECK_CUDA(stream); - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), - obs, dists, centroids)) + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, + centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid @@ -499,8 +491,8 @@ static int initializeCentroids( // Choose remaining centroids for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), - obs, dists, centroids + IDX(0, i, d))) + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, + centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid @@ -529,10 +521,7 @@ static int initializeCentroids( * Distance is measured with Euclidean norm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -553,16 +542,18 @@ static int initializeCentroids( * of squares of assignment. * @return Zero if successful. Otherwise non-zero. 
*/ -template -static int assignCentroids( - handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, - value_type_t* residual_host) { - auto cublas_h = handle.get_cublas_handle(); +template +static int assignCentroids(handle_t const& handle, index_type_t n, + index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) { auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Compute distance between centroids and observation vectors CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); @@ -606,10 +597,7 @@ static int assignCentroids( * All clusters are assumed to be non-empty. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -628,10 +616,8 @@ static int assignCentroids( * Workspace. * @return Zero if successful. Otherwise non-zero. */ -template -static int updateCentroids(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, index_type_t n, +template +static int updateCentroids(handle_t const& handle, index_type_t n, index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, const index_type_t* __restrict__ codes, @@ -649,8 +635,9 @@ static int updateCentroids(handle_t const& handle, constexpr index_type_t grid_lower_bound{65535}; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Device memory thrust::device_ptr obs_copy(work); @@ -722,10 +709,7 @@ namespace raft { * k-means++ algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -754,11 +738,10 @@ namespace raft { * @param seed random seed to be used. * @return error flag. 
*/ -template -int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, value_type_t tol, - index_type_t maxiter, const value_type_t* __restrict__ obs, +template +int kmeans(handle_t const& handle, index_type_t n, index_type_t d, + index_type_t k, value_type_t tol, index_type_t maxiter, + const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, @@ -785,16 +768,17 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // Initialization // ------------------------------------------------------- - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Trivial cases if (k == 1) { CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, - clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, + work, work_int)) WARNING("could not compute k-means centroids"); dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; @@ -840,21 +824,21 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // ------------------------------------------------------- // Choose initial cluster centroids - if (initializeCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - codes, clusterSizes, work, seed)) + if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, + work, seed)) WARNING("could not initialize k-means centroids"); // Apply k-means iteration until convergence for (iter = 0; iter < maxiter; ++iter) { // Update cluster centroids - if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, - clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, + work, work_int)) WARNING("could not update k-means centroids"); // Determine centroid closest to each observation residualPrev = *residual_host; - if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, + clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); // Reinitialize empty clusters with new centroids @@ -868,12 +852,11 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // conditions, such as if obs is corrupt (as seen as a result of a // DataFrame column of NULL edge vals used to create the Graph) while (emptyCentroid < k) { - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, - uniformDist(rng), obs, work, + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) WARNING("could not replace empty centroid"); - if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, + clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); emptyCentroid = (thrust::find(thrust_exec_policy, @@ -905,10 +888,7 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * k-means++ 
algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -926,11 +906,10 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * @param seed random seed to be used. * @return error flag */ -template -int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, value_type_t tol, - index_type_t maxiter, const value_type_t* __restrict__ obs, +template +int kmeans(handle_t const& handle, index_type_t n, index_type_t d, + index_type_t k, value_type_t tol, index_type_t maxiter, + const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, value_type_t& residual, index_type_t& iters, unsigned long long seed = 123456) { using namespace matrix; @@ -950,9 +929,8 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // Perform k-means return kmeans( - handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes, - clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual, - &iters, seed); + handle, n, d, k, tol, maxiter, obs, codes, clusterSizes.raw(), + centroids.raw(), work.raw(), work_int.raw(), &residual, &iters, seed); } } // namespace raft diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index c43154d17a..42fc621a1a 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -72,52 +73,30 @@ struct vector_view_t { : buffer_(buffer), size_(sz) {} vector_view_t(vector_view_t&& other) - : buffer_(other.buffer_), size_(other.size_) { - other.buffer_ = nullptr; - other.size_ = 0; - } + : buffer_(other.raw()), size_(other.size()) {} vector_view_t& operator=(vector_view_t&& other) { - buffer_ = other.buffer_; - size_ = other.size_; - - other.buffer_ = nullptr; - other.size_ = 0; + buffer_ = other.raw(); + size_ = other.size(); } }; -// allocatable vector, using raft handle allocator -// template class vector_t { - handle_t const& handle_; - value_type* buffer_; - size_type size_; - cudaStream_t stream_; - public: vector_t(handle_t const& raft_handle, size_type sz) - : handle_(raft_handle), - buffer_( - static_cast(raft_handle.get_device_allocator()->allocate( - sz * sizeof(value_type), raft_handle.get_stream()))), - size_(sz), - stream_(raft_handle.get_stream()) {} - - ~vector_t(void) { - handle_.get_device_allocator()->deallocate( - buffer_, size_ * sizeof(value_type), stream_); - } + : buffer_(sz, raft_handle.get_stream()), + thrust_policy(raft_handle.get_thrust_policy()) {} - size_type size(void) const { return size_; } + size_type size(void) const { return buffer_.size(); } - value_type* raw(void) { return buffer_; } + value_type* raw(void) { return buffer_.data(); } - value_type const* raw(void) const { return buffer_; } + value_type const* raw(void) const { return buffer_.data(); } - template - value_type nrm1(ThrustExecPolicy t_exe_pol) const { - return thrust::reduce(t_exe_pol, buffer_, buffer_ + size_, value_type{0}, + value_type nrm1() const { + return 
thrust::reduce(thrust_policy, buffer_.data(), + buffer_.data() + buffer_.size(), value_type{0}, [] __device__(auto left, auto right) { auto abs_left = left > 0 ? left : -left; auto abs_right = right > 0 ? right : -right; @@ -125,10 +104,15 @@ class vector_t { }); } - template - void fill(ThrustExecPolicy t_exe_pol, value_type value) { - thrust::fill_n(t_exe_pol, buffer_, size_, value); + void fill(value_type value) { + thrust::fill_n(thrust_policy, buffer_.data(), buffer_.size(), value); } + + private: + using thrust_exec_policy_t = thrust::detail::execute_with_allocator< + rmm::mr::thrust_allocator, thrust::cuda_cub::execute_on_stream_base>; + rmm::device_uvector buffer_; + const thrust_exec_policy_t thrust_policy; }; template @@ -280,31 +264,26 @@ struct sparse_matrix_t { template struct laplacian_matrix_t : sparse_matrix_t { - template - laplacian_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, - index_type const* row_offsets, + laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) : sparse_matrix_t(raft_handle, row_offsets, col_indices, values, nrows, nnz), diagonal_(raft_handle, nrows) { vector_t ones{raft_handle, nrows}; - ones.fill(thrust_exec_policy, 1.0); + ones.fill(1.0); sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } - template laplacian_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) : sparse_matrix_t(raft_handle, csr_m.row_offsets_, csr_m.col_indices_, csr_m.values_, csr_m.nrows_, csr_m.nnz_), diagonal_(raft_handle, csr_m.nrows_) { vector_t ones{raft_handle, csr_m.nrows_}; - ones.fill(thrust_exec_policy, 1.0); + ones.fill(1.0); sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } @@ -351,27 +330,19 @@ struct laplacian_matrix_t : sparse_matrix_t { template struct modularity_matrix_t : laplacian_matrix_t { - template modularity_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) : laplacian_matrix_t( - raft_handle, thrust_exec_policy, row_offsets, col_indices, values, - nrows, nnz) { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( - thrust_exec_policy); + raft_handle, row_offsets, col_indices, values, nrows, nnz) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } - template modularity_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) - : laplacian_matrix_t(raft_handle, - thrust_exec_policy, csr_m) { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( - thrust_exec_policy); + : laplacian_matrix_t(raft_handle, csr_m) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } // y = alpha*A*x + beta*y diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index f8dfe5daa3..fededbfcb4 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -79,19 +78,18 @@ using namespace linalg; * performed. * @return error flag. 
*/ -template +template std::tuple modularity_maximization( - handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, + handle_t const &handle, sparse_matrix_t const &csr_m, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); std::tuple stats; // # iters eigen solver, cluster solver residual, # iters cluster solver @@ -101,7 +99,7 @@ std::tuple modularity_maximization( // Compute eigenvectors of Modularity Matrix // Initialize Modularity Matrix - modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; + modularity_matrix_t B{handle, csr_m}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -111,7 +109,7 @@ std::tuple modularity_maximization( eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); // Whiten eigenvector matrix - transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); // notice that at this point the matrix has already been transposed, so we are scaling // columns @@ -119,8 +117,8 @@ std::tuple modularity_maximization( CHECK_CUDA(stream); // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, - nEigVecs, eigVecs, clusters); + auto pair_cluster = + cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -138,9 +136,8 @@ std::tuple modularity_maximization( * @param clusters (Input, device memory, n entries) Cluster assignments. * @param modularity On exit, modularity */ -template +template void analyzeModularity(handle_t const &handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const &csr_m, vertex_t nClusters, vertex_t const *__restrict__ clusters, @@ -163,15 +160,15 @@ void analyzeModularity(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity - modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; + modularity_matrix_t B{handle, csr_m}; // Initialize output modularity = 0; // Iterate through partitions for (i = 0; i < nClusters; ++i) { - if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, - partModularity, clusters, part_i, Bx, B)) { + if (!construct_indicator(handle, i, n, clustersize, partModularity, + clusters, part_i, Bx, B)) { WARNING("empty partition"); continue; } @@ -180,7 +177,7 @@ void analyzeModularity(handle_t const &handle, modularity += partModularity; } - modularity = modularity / B.diagonal_.nrm1(thrust_exec_policy); + modularity = modularity / B.diagonal_.nrm1(); } } // namespace spectral diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 841fca04d9..2df3812a4a 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -62,19 +61,18 @@ using namespace linalg; * performed. * @return statistics: number of eigensolver iterations, . 
*/ -template +template std::tuple partition( - handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, + handle_t const &handle, sparse_matrix_t const &csr_m, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); std::tuple stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver @@ -89,7 +87,7 @@ std::tuple partition( // Initialize Laplacian ///sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; + laplacian_matrix_t L{handle, csr_m}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -99,11 +97,11 @@ std::tuple partition( eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); // Whiten eigenvector matrix - transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, - nEigVecs, eigVecs, clusters); + auto pair_cluster = + cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -129,9 +127,8 @@ std::tuple partition( * @param cost On exit, partition cost function. * @return error flag. */ -template +template void analyzePartition(handle_t const &handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const &csr_m, vertex_t nClusters, const vertex_t *__restrict__ clusters, weight_t &edgeCut, weight_t &cost) { @@ -140,8 +137,8 @@ void analyzePartition(handle_t const &handle, vertex_t i; vertex_t n = csr_m.nrows_; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); weight_t partEdgesCut, clustersize; @@ -155,7 +152,7 @@ void analyzePartition(handle_t const &handle, // Initialize Laplacian ///sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; + laplacian_matrix_t L{handle, csr_m}; // Initialize output cost = 0; @@ -164,8 +161,8 @@ void analyzePartition(handle_t const &handle, // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, - partEdgesCut, clusters, part_i, Lx, L)) { + if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, + part_i, Lx, L)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 40dde30a74..c148350c0f 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -108,13 +107,12 @@ cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { return cudaSuccess; } -template -void transform_eigen_matrix(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, edge_t n, - vertex_t nEigVecs, weight_t* eigVecs) { - auto 
cublas_h = handle.get_cublas_handle(); +template +void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, + weight_t* eigVecs) { auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); const weight_t zero{0.0}; const weight_t one{1.0}; @@ -187,16 +185,15 @@ struct equal_to_i_op { // Construct indicator vector for ith partition // -template -bool construct_indicator(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, edge_t index, - edge_t n, weight_t& clustersize, weight_t& partStats, +template +bool construct_indicator(handle_t const& handle, edge_t index, edge_t n, + weight_t& clustersize, weight_t& partStats, vertex_t const* __restrict__ clusters, vector_t& part_i, vector_t& Bx, laplacian_matrix_t const& B) { - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); thrust::for_each(thrust_exec_policy, thrust::make_zip_iterator(thrust::make_tuple( diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index 4ff6cdf5fa..d280b3e95c 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -49,8 +49,7 @@ TEST(Raft, ClusterSolvers) { kmeans_solver_t cluster_solver{cfg}; - EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, - eigvecs, codes)); + EXPECT_ANY_THROW(cluster_solver.solve(h, n, d, eigvecs, codes)); } TEST(Raft, ModularitySolvers) { @@ -89,14 +88,12 @@ TEST(Raft, ModularitySolvers) { auto stream = h.get_stream(); sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; - auto t_exe_p = thrust::cuda::par.on(stream); EXPECT_ANY_THROW(spectral::modularity_maximization( - h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type modularity{0}; - EXPECT_ANY_THROW( - spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); + EXPECT_ANY_THROW(spectral::analyzeModularity(h, sm, k, clusters, modularity)); } } // namespace raft diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu index e2ed2c01dc..8d5cd68f13 100644 --- a/cpp/test/distance/dist_adj.cu +++ b/cpp/test/distance/dist_adj.cu @@ -77,12 +77,11 @@ class DistanceAdjTest int n = params.n; int k = params.k; bool isRowMajor = params.isRowMajor; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(x, m * k); - raft::allocate(y, n * k); - raft::allocate(dist_ref, m * n); - raft::allocate(dist, m * n); + raft::allocate(x, m * k, stream); + raft::allocate(y, n * k, stream); + raft::allocate(dist_ref, m * n, stream); + raft::allocate(dist, m * n, stream); r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream); r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); @@ -94,7 +93,7 @@ class DistanceAdjTest raft::distance::getWorkspaceSize(x, y, m, n, k); if (worksize != 0) { - raft::allocate(workspace, worksize); + raft::allocate(workspace, worksize, stream); } auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) { @@ -103,21 +102,16 @@ class DistanceAdjTest raft::distance::distance( x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor); - CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(workspace)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(x)); - CUDA_CHECK(cudaFree(y)); - 
CUDA_CHECK(cudaFree(dist_ref)); - CUDA_CHECK(cudaFree(dist)); - } + void TearDown() override {} protected: DistanceAdjInputs params; DataType *x, *y; bool *dist_ref, *dist; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index 9e3290593d..4798d102f3 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -392,13 +392,12 @@ class DistanceTest : public ::testing::TestWithParam> { int k = params.k; DataType metric_arg = params.metric_arg; bool isRowMajor = params.isRowMajor; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(x, m * k); - raft::allocate(y, n * k); - raft::allocate(dist_ref, m * n); - raft::allocate(dist, m * n); - raft::allocate(dist2, m * n); + raft::allocate(x, m * k, stream); + raft::allocate(y, n * k, stream); + raft::allocate(dist_ref, m * n, stream); + raft::allocate(dist, m * n, stream); + raft::allocate(dist2, m * n, stream); if (distanceType == raft::distance::DistanceType::HellingerExpanded || distanceType == raft::distance::DistanceType::JensenShannon || distanceType == raft::distance::DistanceType::KLDivergence) { @@ -416,7 +415,6 @@ class DistanceTest : public ::testing::TestWithParam> { r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream); r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); } - naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor, metric_arg); char *workspace = nullptr; @@ -424,28 +422,24 @@ class DistanceTest : public ::testing::TestWithParam> { raft::distance::getWorkspaceSize(x, y, m, n, k); if (worksize != 0) { - raft::allocate(workspace, worksize); + raft::allocate(workspace, worksize, stream); } DataType threshold = -10000.f; distanceLauncher(x, y, dist, dist2, m, n, k, params, threshold, workspace, worksize, stream, isRowMajor, metric_arg); - CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(workspace)); } void TearDown() override { - CUDA_CHECK(cudaFree(x)); - CUDA_CHECK(cudaFree(y)); - CUDA_CHECK(cudaFree(dist_ref)); - CUDA_CHECK(cudaFree(dist)); - CUDA_CHECK(cudaFree(dist2)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: DistanceInputs params; DataType *x, *y, *dist_ref, *dist, *dist2; + cudaStream_t stream; }; } // end namespace distance diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 4573a070b6..cfea4ee2d9 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -107,13 +107,13 @@ class FusedL2NNTest : public ::testing::TestWithParam> { int n = params.n; int k = params.k; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(x, m * k); - raft::allocate(y, n * k); - raft::allocate(xn, m); - raft::allocate(yn, n); - raft::allocate(workspace, sizeof(int) * m); - raft::allocate(min, m); - raft::allocate(min_ref, m); + raft::allocate(x, m * k, stream); + raft::allocate(y, n * k, stream); + raft::allocate(xn, m, stream); + raft::allocate(yn, n, stream); + raft::allocate(workspace, sizeof(int) * m, stream); + raft::allocate(min, m, stream); + raft::allocate(min_ref, m, stream); r.uniform(x, m * k, DataT(-1.0), DataT(1.0), stream); r.uniform(y, n * k, DataT(-1.0), DataT(1.0), stream); generateGoldenResult(); @@ -122,15 +122,8 @@ class FusedL2NNTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - 
CUDA_CHECK(cudaFree(x)); - CUDA_CHECK(cudaFree(y)); - CUDA_CHECK(cudaFree(xn)); - CUDA_CHECK(cudaFree(yn)); - CUDA_CHECK(cudaFree(workspace)); - CUDA_CHECK(cudaFree(min_ref)); - CUDA_CHECK(cudaFree(min)); } protected: @@ -282,18 +275,17 @@ class FusedL2NNDetTest : public FusedL2NNTest { void SetUp() override { FusedL2NNTest::SetUp(); int m = this->params.m; - raft::allocate(min1, m); + CUDA_CHECK(cudaStreamCreate(&stream)); + raft::allocate(min1, m, stream); } - void TearDown() override { - FusedL2NNTest::TearDown(); - CUDA_CHECK(cudaFree(min1)); - } + void TearDown() override { FusedL2NNTest::TearDown(); } protected: cub::KeyValuePair *min1; static const int NumRepeats = 100; + cudaStream_t stream; void generateGoldenResult() override {} }; diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index 328137f42d..15794ef568 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -100,18 +100,15 @@ TEST(Raft, SpectralSolvers) { seed}; kmeans_solver_t cluster_solver{clust_cfg}; - auto stream = h.get_stream(); - - auto t_exe_p = thrust::cuda::par.on(stream); sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; - EXPECT_ANY_THROW(spectral::partition( - h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + EXPECT_ANY_THROW(spectral::partition(h, sm, eig_solver, cluster_solver, + clusters, eigvals, eigvecs)); value_type edgeCut{0}; value_type cost{0}; EXPECT_ANY_THROW( - spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); + spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost)); } } // namespace raft diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu index dc2846fdba..b28c754a5a 100644 --- a/cpp/test/label/label.cu +++ b/cpp/test/label/label.cu @@ -20,7 +20,6 @@ #include #include -#include #include "../test_utils.h" #include @@ -44,9 +43,9 @@ TEST_F(MakeMonotonicTest, Result) { float *data, *actual, *expected; - raft::allocate(data, m, true); - raft::allocate(actual, m, true); - raft::allocate(expected, m, true); + raft::allocate(data, m, stream, true); + raft::allocate(actual, m, stream, true); + raft::allocate(expected, m, stream, true); float *data_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0}; @@ -57,17 +56,14 @@ TEST_F(MakeMonotonicTest, Result) { raft::update_device(data, data_h, m, stream); raft::update_device(expected, expected_h, m, stream); - std::shared_ptr allocator( - new raft::mr::device::default_allocator); - make_monotonic(actual, data, m, stream, allocator); + make_monotonic(actual, data, m, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); ASSERT_TRUE(devArrMatch(actual, expected, m, raft::Compare(), stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(actual)); delete data_h; delete expected_h; @@ -76,39 +72,35 @@ TEST_F(MakeMonotonicTest, Result) { TEST(labelTest, Classlabels) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr allocator( - new raft::mr::device::default_allocator); int n_rows = 6; float *y_d; - raft::allocate(y_d, n_rows); + raft::allocate(y_d, n_rows, stream); float y_h[] = {2, -1, 1, 2, 1, 1}; raft::update_device(y_d, y_h, n_rows, stream); - int n_classes; - float *y_unique_d; - getUniquelabels(y_d, n_rows, &y_unique_d, &n_classes, stream, allocator); + rmm::device_uvector y_unique_d(0, stream); + int n_classes = getUniquelabels(y_unique_d, y_d, n_rows, stream); ASSERT_EQ(n_classes, 3); float y_unique_exp[] = {-1, 1, 2}; - 
EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes, + EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d.data(), n_classes, raft::Compare(), stream)); float *y_relabeled_d; - raft::allocate(y_relabeled_d, n_rows); + raft::allocate(y_relabeled_d, n_rows, stream); - getOvrlabels(y_d, n_rows, y_unique_d, n_classes, y_relabeled_d, 2, stream); + getOvrlabels(y_d, n_rows, y_unique_d.data(), n_classes, y_relabeled_d, 2, + stream); float y_relabeled_exp[] = {1, -1, -1, 1, -1, -1}; EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows, raft::Compare(), stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(y_d)); - CUDA_CHECK(cudaFree(y_unique_d)); - CUDA_CHECK(cudaFree(y_relabeled_d)); } }; // namespace label }; // namespace raft diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu index a2f14a8dbc..28d8d59884 100644 --- a/cpp/test/label/merge_labels.cu +++ b/cpp/test/label/merge_labels.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include "../test_utils.h" @@ -50,7 +50,7 @@ class MergeLabelsTest expected(params.N, stream), R(params.N, stream), mask(params.N, stream), - m(1, stream) {} + m(stream) {} void Run() { raft::update_device(labels_a.data(), params.labels_a.data(), params.N, @@ -76,7 +76,7 @@ class MergeLabelsTest raft::handle_t handle; cudaStream_t stream; rmm::device_uvector labels_a, labels_b, expected, R; - rmm::device_uvector mask, m; + rmm::device_scalar mask, m; }; using MergeLabelsTestI = MergeLabelsTest; diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu index 04f473f836..08429e18f2 100644 --- a/cpp/test/lap/lap.cu +++ b/cpp/test/lap/lap.cu @@ -24,6 +24,8 @@ */ #include +#include + #include #include #include @@ -65,15 +67,12 @@ void hungarian_test(int problemsize, int costrange, int problemcount, for (int j = 0; j < problemcount; j++) { generateProblem(h_cost, batchsize, problemsize, costrange); - raft::mr::device::buffer elements_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize * problemsize); - raft::mr::device::buffer row_assignment_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize); - raft::mr::device::buffer col_assignment_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize); + rmm::device_uvector elements_v( + batchsize * problemsize * problemsize, handle.get_stream()); + rmm::device_uvector row_assignment_v(batchsize * problemsize, + handle.get_stream()); + rmm::device_uvector col_assignment_v(batchsize * problemsize, + handle.get_stream()); raft::update_device(elements_v.data(), h_cost, batchsize * problemsize * problemsize, diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu index 2fc9d4e30f..301f069a33 100644 --- a/cpp/test/linalg/add.cu +++ b/cpp/test/linalg/add.cu @@ -32,10 +32,10 @@ class AddTest : public ::testing::TestWithParam> { raft::random::Rng r(params.seed); int len = params.len; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in1, len); - raft::allocate(in2, len); - raft::allocate(out_ref, len); - raft::allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, InT(-1.0), InT(1.0), stream); r.uniform(in2, len, InT(-1.0), InT(1.0), stream); naiveAddElem(out_ref, in1, in2, len); @@ -43,11 +43,7 @@ class AddTest : public ::testing::TestWithParam> { } void TearDown() override { - 
CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index 3ae4f86066..475d8e58ff 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -17,8 +17,8 @@ #include #include #include -#include #include +#include #include "../test_utils.h" #include "binary_op.cuh" @@ -48,10 +48,10 @@ class BinaryOpTest cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); IdxType len = params.len; - allocate(in1, len); - allocate(in2, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, InType(-1.0), InType(1.0), stream); r.uniform(in2, len, InType(-1.0), InType(1.0), stream); naiveAdd(out_ref, in1, in2, len); @@ -136,9 +136,9 @@ class BinaryOpAlignment : public ::testing::Test { // Test to trigger cudaErrorMisalignedAddress if veclen is incorrectly // chosen. int n = 1024; - mr::device::buffer x(handle.get_device_allocator(), stream, n); - mr::device::buffer y(handle.get_device_allocator(), stream, n); - mr::device::buffer z(handle.get_device_allocator(), stream, n); + rmm::device_uvector x(n, stream); + rmm::device_uvector y(n, stream); + rmm::device_uvector z(n, stream); CUDA_CHECK(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream)); CUDA_CHECK(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream)); raft::linalg::binaryOp( diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 00236d53fa..00db1715dc 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -19,8 +19,9 @@ #include #include #include -#include -#include +#include +#include + #include #include #include "../test_utils.h" @@ -31,12 +32,11 @@ template class CholeskyR1Test : public ::testing::Test { protected: CholeskyR1Test() - : allocator(handle.get_device_allocator()), - G(allocator, handle.get_stream(), n_rows * n_rows), - L(allocator, handle.get_stream(), n_rows * n_rows), - L_exp(allocator, handle.get_stream(), n_rows * n_rows), - devInfo(allocator, handle.get_stream(), 1), - workspace(allocator, handle.get_stream()) { + : G(n_rows * n_rows, handle.get_stream()), + L(n_rows * n_rows, handle.get_stream()), + L_exp(n_rows * n_rows, handle.get_stream()), + devInfo(handle.get_stream()), + workspace(0, handle.get_stream()) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(G.data(), G_host, n_rows * n_rows, stream); @@ -105,7 +105,6 @@ class CholeskyR1Test : public ::testing::Test { } raft::handle_t handle; - std::shared_ptr allocator; cusolverDnHandle_t solver_handle; cudaStream_t stream; @@ -120,11 +119,11 @@ class CholeskyR1Test : public ::testing::Test { math_t G2_host[4] = {3, 4, 2, 1}; - raft::mr::device::buffer devInfo; - raft::mr::device::buffer G; - raft::mr::device::buffer L_exp; - raft::mr::device::buffer L; - raft::mr::device::buffer workspace; + rmm::device_scalar devInfo; + rmm::device_uvector G; + rmm::device_uvector L_exp; + rmm::device_uvector L; + rmm::device_uvector workspace; }; typedef ::testing::Types FloatTypes; diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index e45f5651b4..45dbd9dcc4 100644 --- 
a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -57,11 +57,10 @@ class coalescedReductionTest raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; int len = rows * cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(data, len); - raft::allocate(dots_exp, rows); - raft::allocate(dots_act, rows); + raft::allocate(data, len, stream); + raft::allocate(dots_exp, rows, stream); + raft::allocate(dots_act, rows, stream); r.uniform(data, len, T(-1.0), T(1.0), stream); naiveCoalescedReduction(dots_exp, data, cols, rows, stream); @@ -70,18 +69,18 @@ class coalescedReductionTest // Add to result with inplace = true next coalescedReductionLaunch(dots_act, data, cols, rows, stream, true); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: coalescedReductionInputs params; T *data, *dots_exp, *dots_act; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu index 2396558939..563f96c835 100644 --- a/cpp/test/linalg/divide.cu +++ b/cpp/test/linalg/divide.cu @@ -51,27 +51,26 @@ class DivideTest ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in, len); - raft::allocate(out_ref, len); - raft::allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, T(-1.0), T(1.0), stream); naiveDivide(out_ref, in, params.scalar, len, stream); divideScalar(out, in, params.scalar, len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: UnaryOpInputs params; T *in, *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 159d288174..6e26757cf3 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -50,24 +50,24 @@ class EigTest : public ::testing::TestWithParam> { raft::random::Rng r(params.seed); int len = params.len; - raft::allocate(cov_matrix, len); + raft::allocate(cov_matrix, len, stream); T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix, cov_matrix_h, len, stream); - raft::allocate(eig_vectors, len); - raft::allocate(eig_vals, params.n_col); - raft::allocate(eig_vectors_jacobi, len); - raft::allocate(eig_vals_jacobi, params.n_col); + raft::allocate(eig_vectors, len, stream); + raft::allocate(eig_vals, params.n_col, stream); + raft::allocate(eig_vectors_jacobi, len, stream); + raft::allocate(eig_vals_jacobi, params.n_col, stream); T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874, 0.4874, -0.5123, 0.6498, 0.2789, -0.2789, -0.6498, 0.4874, 0.5123, 0.5123, 0.4874}; T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; - raft::allocate(eig_vectors_ref, len); - raft::allocate(eig_vals_ref, 
params.n_col); + raft::allocate(eig_vectors_ref, len, stream); + raft::allocate(eig_vals_ref, params.n_col, stream); raft::update_device(eig_vectors_ref, eig_vectors_ref_h, len, stream); raft::update_device(eig_vals_ref, eig_vals_ref_h, params.n_col, stream); @@ -82,11 +82,11 @@ class EigTest : public ::testing::TestWithParam> { // test code for comparing two methods len = params.n * params.n; - raft::allocate(cov_matrix_large, len); - raft::allocate(eig_vectors_large, len); - raft::allocate(eig_vectors_jacobi_large, len); - raft::allocate(eig_vals_large, params.n); - raft::allocate(eig_vals_jacobi_large, params.n); + raft::allocate(cov_matrix_large, len, stream); + raft::allocate(eig_vectors_large, len, stream); + raft::allocate(eig_vectors_jacobi_large, len, stream); + raft::allocate(eig_vals_large, params.n, stream); + raft::allocate(eig_vals_jacobi_large, params.n, stream); r.uniform(cov_matrix_large, len, T(-1.0), T(1.0), stream); @@ -97,15 +97,7 @@ class EigTest : public ::testing::TestWithParam> { sweeps); } - void TearDown() override { - CUDA_CHECK(cudaFree(cov_matrix)); - CUDA_CHECK(cudaFree(eig_vectors)); - CUDA_CHECK(cudaFree(eig_vectors_jacobi)); - CUDA_CHECK(cudaFree(eig_vals)); - CUDA_CHECK(cudaFree(eig_vals_jacobi)); - CUDA_CHECK(cudaFree(eig_vectors_ref)); - CUDA_CHECK(cudaFree(eig_vals_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: EigInputs params; diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index b3980f281d..bdd0a08ff6 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -51,36 +51,31 @@ class EigSelTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); int len = params.len; - raft::allocate(cov_matrix, len); + raft::allocate(cov_matrix, len, stream); T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix, cov_matrix_h, len, stream); - raft::allocate(eig_vectors, 12); - raft::allocate(eig_vals, params.n_col); + raft::allocate(eig_vectors, 12, stream); + raft::allocate(eig_vals, params.n_col, stream); T eig_vectors_ref_h[] = {-0.5123, 0.4874, 0.4874, -0.5123, 0.6498, 0.2789, -0.2789, -0.6498, 0.4874, 0.5123, 0.5123, 0.4874}; T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; - raft::allocate(eig_vectors_ref, 12); - raft::allocate(eig_vals_ref, params.n_col); + raft::allocate(eig_vectors_ref, 12, stream); + raft::allocate(eig_vals_ref, params.n_col, stream); raft::update_device(eig_vectors_ref, eig_vectors_ref_h, 12, stream); raft::update_device(eig_vals_ref, eig_vals_ref_h, 4, stream); eigSelDC(handle, cov_matrix, params.n_row, params.n_col, 3, eig_vectors, eig_vals, EigVecMemUsage::OVERWRITE_INPUT, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(cov_matrix)); - CUDA_CHECK(cudaFree(eig_vectors)); - CUDA_CHECK(cudaFree(eig_vals)); - CUDA_CHECK(cudaFree(eig_vectors_ref)); - CUDA_CHECK(cudaFree(eig_vals_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: EigSelInputs params; diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu index 572951c557..e955f7a354 100644 --- a/cpp/test/linalg/eltwise.cu +++ b/cpp/test/linalg/eltwise.cu @@ -69,9 +69,9 @@ class ScalarMultiplyTest cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(in, len); - allocate(out_ref, len); - allocate(out, len); + 
raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, T(-1.0), T(1.0), stream); naiveScale(out_ref, in, scalar, len, stream); scalarMultiply(out, in, scalar, len, stream); @@ -156,10 +156,10 @@ class EltwiseAddTest : public ::testing::TestWithParam> { cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); int len = params.len; - allocate(in1, len); - allocate(in2, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, T(-1.0), T(1.0), stream); r.uniform(in2, len, T(-1.0), T(1.0), stream); naiveAdd(out_ref, in1, in2, len, stream); diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 227bce6a48..5b13fb5362 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -18,7 +18,6 @@ #include #include #include -#include #include #include "../test_utils.h" @@ -48,11 +47,10 @@ void create_ref(OutType *out_ref, const InType *in1, const InType *in2, const InType *in3, InType scalar, IdxType len, cudaStream_t stream) { InType *tmp; - allocate(tmp, len); + raft::allocate(tmp, len, stream); eltwiseAdd(tmp, in1, in2, len, stream); eltwiseAdd(out_ref, tmp, in3, len, stream); scalarAdd(out_ref, out_ref, (OutType)scalar, len, stream); - CUDA_CHECK(cudaFree(tmp)); } template @@ -64,35 +62,32 @@ class MapTest ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); IdxType len = params.len; - allocate(in1, len); - allocate(in2, len); - allocate(in3, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(in3, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, InType(-1.0), InType(1.0), stream); r.uniform(in2, len, InType(-1.0), InType(1.0), stream); r.uniform(in3, len, InType(-1.0), InType(1.0), stream); create_ref(out_ref, in1, in2, in3, params.scalar, len, stream); mapLaunch(out, in1, in2, in3, params.scalar, len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - CUDA_CHECK(cudaFree(in3)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MapInputs params; InType *in1, *in2, *in3; OutType *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf_i32 = { diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu index 6e146fa4bb..4a44e59504 100644 --- a/cpp/test/linalg/map_then_reduce.cu +++ b/cpp/test/linalg/map_then_reduce.cu @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include "../test_utils.h" namespace raft { @@ -74,26 +76,25 @@ class MapReduceTest : public ::testing::TestWithParam> { raft::random::Rng r(params.seed); auto len = params.len; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(in, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, InType(-1.0), InType(1.0), stream); mapReduceLaunch(out_ref, out, in, len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + 
CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MapReduceInputs params; InType *in; OutType *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf = { @@ -131,9 +132,7 @@ class MapGenericReduceTest : public ::testing::Test { protected: MapGenericReduceTest() - : allocator(handle.get_device_allocator()), - input(allocator, handle.get_stream(), n), - output(allocator, handle.get_stream(), 1) { + : input(n, handle.get_stream()), output(handle.get_stream()) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); initInput(input.data(), input.size(), stream); @@ -172,9 +171,8 @@ class MapGenericReduceTest : public ::testing::Test { int n = 1237; raft::handle_t handle; cudaStream_t stream; - std::shared_ptr allocator; - raft::mr::device::buffer input; - raft::mr::device::buffer output; + rmm::device_uvector input; + rmm::device_scalar output; }; using IoTypePair = diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index aa46c78b0f..e017ee0918 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -66,14 +66,13 @@ class MatVecOpTest IdxType N = params.rows, D = params.cols; IdxType len = N * D; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(in, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); IdxType vecLen = params.bcastAlongRows ? D : N; - allocate(vec1, vecLen); - allocate(vec2, vecLen); + raft::allocate(vec1, vecLen, stream); + raft::allocate(vec2, vecLen, stream); r.uniform(in, len, (T)-1.0, (T)1.0, stream); r.uniform(vec1, vecLen, (T)-1.0, (T)1.0, stream); r.uniform(vec2, vecLen, (T)-1.0, (T)1.0, stream); @@ -86,20 +85,18 @@ class MatVecOpTest } matrixVectorOpLaunch(out, in, vec1, vec2, D, N, params.rowMajor, params.bcastAlongRows, params.useTwoVectors, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(vec1)); - CUDA_CHECK(cudaFree(vec2)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(in)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MatVecOpInputs params; T *in, *out, *out_ref, *vec1, *vec2; + cudaStream_t stream; }; const std::vector> inputsf_i32 = { diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu index 1d3e753de3..d7bda7c27d 100644 --- a/cpp/test/linalg/multiply.cu +++ b/cpp/test/linalg/multiply.cu @@ -31,27 +31,26 @@ class MultiplyTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in, len); - raft::allocate(out_ref, len); - raft::allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, T(-1.0), T(1.0), stream); naiveScale(out_ref, in, params.scalar, len, stream); multiplyScalar(out, in, params.scalar, len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out_ref)); - 
CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: UnaryOpInputs params; T *in, *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index acc25addd0..5563064982 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -78,9 +78,9 @@ class RowNormTest : public ::testing::TestWithParam> { int rows = params.rows, cols = params.cols, len = rows * cols; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(data, len); - raft::allocate(dots_exp, rows); - raft::allocate(dots_act, rows); + raft::allocate(data, len, stream); + raft::allocate(dots_exp, rows, stream); + raft::allocate(dots_act, rows, stream); r.uniform(data, len, T(-1.0), T(1.0), stream); naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream); @@ -143,10 +143,10 @@ class ColNormTest : public ::testing::TestWithParam> { int rows = params.rows, cols = params.cols, len = rows * cols; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(data, len); + raft::allocate(data, len, stream); r.uniform(data, len, T(-1.0), T(1.0), stream); - raft::allocate(dots_exp, cols); - raft::allocate(dots_act, cols); + raft::allocate(dots_exp, cols, stream); + raft::allocate(dots_act, cols, stream); naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream); @@ -157,13 +157,11 @@ class ColNormTest : public ::testing::TestWithParam> { } else { colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream); } - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index 9082397265..7ceeaf7f8e 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -63,9 +63,9 @@ class ReduceTest int rows = params.rows, cols = params.cols; int len = rows * cols; outlen = params.alongRows ? 
rows : cols; - raft::allocate(data, len); - raft::allocate(dots_exp, outlen); - raft::allocate(dots_act, outlen); + raft::allocate(data, len, stream); + raft::allocate(dots_exp, outlen, stream); + raft::allocate(dots_act, outlen, stream); r.uniform(data, len, InType(-1.0), InType(1.0), stream); naiveReduction(dots_exp, data, cols, rows, params.rowMajor, params.alongRows, stream); @@ -82,9 +82,7 @@ class ReduceTest } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 30a9c2e271..7f8319636b 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -18,9 +18,9 @@ #include #include -#include #include #include +#include namespace raft { namespace linalg { @@ -54,17 +54,20 @@ void unaryAndGemv(OutType *dots, const InType *data, int D, int N, cudaStream_t stream) { //computes a MLCommon unary op on data (squares it), then computes Ax //(A input matrix and x column vector) to sum columns - thrust::device_vector sq(D * N); + rmm::device_uvector sq(D * N, stream); raft::linalg::unaryOp( thrust::raw_pointer_cast(sq.data()), data, D * N, [] __device__(InType v) { return static_cast(v * v); }, stream); cublasHandle_t handle; CUBLAS_CHECK(cublasCreate(&handle)); - thrust::device_vector ones(N, 1); //column vector [1...1] + rmm::device_uvector ones(N, stream); //column vector [1...1] + raft::linalg::unaryOp( + ones.data(), ones.data(), ones.size(), + [=] __device__(OutType input) { return 1; }, stream); OutType alpha = 1, beta = 0; - CUBLAS_CHECK(raft::linalg::cublasgemv( - handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D, - thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream)); + CUBLAS_CHECK(raft::linalg::cublasgemv(handle, CUBLAS_OP_N, D, N, &alpha, + sq.data(), D, ones.data(), 1, &beta, + dots, 1, stream)); CUDA_CHECK(cudaDeviceSynchronize()); CUBLAS_CHECK(cublasDestroy(handle)); } diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index b27fa2ac1a..55d8cc0e92 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -49,9 +49,9 @@ class stridedReductionTest int rows = params.rows, cols = params.cols; int len = rows * cols; - raft::allocate(data, len); - raft::allocate(dots_exp, cols); //expected dot products (from test) - raft::allocate(dots_act, cols); //actual dot products (from prim) + raft::allocate(data, len, stream); + raft::allocate(dots_exp, cols, stream); //expected dot products (from test) + raft::allocate(dots_act, cols, stream); //actual dot products (from prim) r.uniform(data, len, T(-1.0), T(1.0), stream); //initialize matrix to random @@ -60,9 +60,7 @@ class stridedReductionTest } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu index ced3f65fdd..27dea8503f 100644 --- a/cpp/test/linalg/subtract.cu +++ b/cpp/test/linalg/subtract.cu @@ -79,12 +79,11 @@ class SubtractTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in1, len); - raft::allocate(in2, 
len); - raft::allocate(out_ref, len); - raft::allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, T(-1.0), T(1.0), stream); r.uniform(in2, len, T(-1.0), T(1.0), stream); @@ -95,19 +94,18 @@ class SubtractTest : public ::testing::TestWithParam> { subtractScalar(out, out, T(1), len, stream); subtract(in1, in1, in2, len, stream); subtractScalar(in1, in1, T(1), len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: SubtractInputs params; T *in1, *in2, *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf2 = { diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index fff321768f..72a27790de 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -48,8 +48,8 @@ class SvdTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream = handle.get_stream(); - raft::allocate(data, len); + stream = handle.get_stream(); + raft::allocate(data, len, stream); ASSERT(params.n_row == 3, "This test only supports nrows=3!"); ASSERT(params.len == 6, "This test only supports len=6!"); @@ -59,9 +59,9 @@ class SvdTest : public ::testing::TestWithParam> { int left_evl = params.n_row * params.n_col; int right_evl = params.n_col * params.n_col; - raft::allocate(left_eig_vectors_qr, left_evl); - raft::allocate(right_eig_vectors_trans_qr, right_evl); - raft::allocate(sing_vals_qr, params.n_col); + raft::allocate(left_eig_vectors_qr, left_evl, stream); + raft::allocate(right_eig_vectors_trans_qr, right_evl, stream); + raft::allocate(sing_vals_qr, params.n_col, stream); // allocate(left_eig_vectors_jacobi, left_evl); // allocate(right_eig_vectors_trans_jacobi, right_evl); @@ -74,9 +74,9 @@ class SvdTest : public ::testing::TestWithParam> { T sing_vals_ref_h[] = {7.065283, 1.040081}; - raft::allocate(left_eig_vectors_ref, left_evl); - raft::allocate(right_eig_vectors_ref, right_evl); - raft::allocate(sing_vals_ref, params.n_col); + raft::allocate(left_eig_vectors_ref, left_evl, stream); + raft::allocate(right_eig_vectors_ref, right_evl, stream); + raft::allocate(sing_vals_ref, params.n_col, stream); raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl, stream); @@ -87,22 +87,16 @@ class SvdTest : public ::testing::TestWithParam> { svdQR(handle, data, params.n_row, params.n_col, sing_vals_qr, left_eig_vectors_qr, right_eig_vectors_trans_qr, true, true, true, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(left_eig_vectors_qr)); - CUDA_CHECK(cudaFree(right_eig_vectors_trans_qr)); - CUDA_CHECK(cudaFree(sing_vals_qr)); - CUDA_CHECK(cudaFree(left_eig_vectors_ref)); - CUDA_CHECK(cudaFree(right_eig_vectors_ref)); - CUDA_CHECK(cudaFree(sing_vals_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: SvdInputs params; T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, *left_eig_vectors_ref, *right_eig_vectors_ref, *sing_vals_ref; + cudaStream_t stream; }; const std::vector> inputsf2 = { diff --git 
a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index f10b029962..c574f54a05 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -48,26 +48,22 @@ class TransposeTest : public ::testing::TestWithParam> { int len = params.len; - raft::allocate(data, len); + raft::allocate(data, len, stream); ASSERT(params.len == 9, "This test works only with len=9!"); T data_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; raft::update_device(data, data_h, len, stream); - raft::allocate(data_trans_ref, len); + raft::allocate(data_trans_ref, len, stream); T data_ref_h[] = {1.0, 4.0, 7.0, 2.0, 5.0, 8.0, 3.0, 6.0, 9.0}; raft::update_device(data_trans_ref, data_ref_h, len, stream); - raft::allocate(data_trans, len); + raft::allocate(data_trans, len, stream); transpose(handle, data, data_trans, params.n_row, params.n_col, stream); transpose(data, params.n_row, stream); } - void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(data_trans)); - CUDA_CHECK(cudaFree(data_trans_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: TranposeInputs params; diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index 666ab8619d..042e8b9cbf 100644 --- a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -53,18 +53,15 @@ class UnaryOpTest raft::random::Rng r(params.seed); CUDA_CHECK(cudaStreamCreate(&stream)); auto len = params.len; - allocate(in, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, InType(-1.0), InType(1.0), stream); } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); } virtual void DoTest() { diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index 578139623a..63381dec07 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -115,22 +115,22 @@ class MathTest : public ::testing::TestWithParam> { random::Rng r(params.seed); int len = params.len; - allocate(in_power, len); - allocate(out_power_ref, len); - allocate(in_sqrt, len); - allocate(out_sqrt_ref, len); - allocate(in_sign_flip, len); - allocate(out_sign_flip_ref, len); - raft::handle_t handle; - cudaStream_t stream; + stream = handle.get_stream(); CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(in_ratio, 4); + raft::allocate(in_power, len, stream); + raft::allocate(out_power_ref, len, stream); + raft::allocate(in_sqrt, len, stream); + raft::allocate(out_sqrt_ref, len, stream); + raft::allocate(in_sign_flip, len, stream); + raft::allocate(out_sign_flip_ref, len, stream); + + raft::allocate(in_ratio, 4, stream); T in_ratio_h[4] = {1.0, 2.0, 2.0, 3.0}; update_device(in_ratio, in_ratio_h, 4, stream); - allocate(out_ratio_ref, 4); + raft::allocate(out_ratio_ref, 4, stream); T out_ratio_ref_h[4] = {0.125, 0.25, 0.25, 0.375}; update_device(out_ratio_ref, out_ratio_ref_h, 4, stream); @@ -150,9 +150,9 @@ class MathTest : public ::testing::TestWithParam> { naiveSignFlip(in_sign_flip, out_sign_flip_ref, params.n_row, params.n_col); signFlip(in_sign_flip, params.n_row, params.n_col, stream); - allocate(in_recip, 4); - allocate(in_recip_ref, 4); - allocate(out_recip, 4); + raft::allocate(in_recip, 4, stream); + raft::allocate(in_recip_ref, 4, stream); + raft::allocate(out_recip, 4, stream); // default 
threshold is 1e-15 std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; std::vector in_recip_ref_h = {10.0, 100.0, -100.0, 0.0}; @@ -167,38 +167,23 @@ class MathTest : public ::testing::TestWithParam> { std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; std::vector in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1}; - allocate(in_smallzero, 4); - allocate(out_smallzero, 4); - allocate(out_smallzero_ref, 4); + raft::allocate(in_smallzero, 4, stream); + raft::allocate(out_smallzero, 4, stream); + raft::allocate(out_smallzero_ref, 4, stream); update_device(in_smallzero, in_small_val_zero_h.data(), 4, stream); update_device(out_smallzero_ref, in_small_val_zero_ref_h.data(), 4, stream); setSmallValuesZero(out_smallzero, in_smallzero, 4, stream); setSmallValuesZero(in_smallzero, 4, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(in_power)); - CUDA_CHECK(cudaFree(out_power_ref)); - CUDA_CHECK(cudaFree(in_sqrt)); - CUDA_CHECK(cudaFree(out_sqrt_ref)); - CUDA_CHECK(cudaFree(in_ratio)); - CUDA_CHECK(cudaFree(out_ratio_ref)); - CUDA_CHECK(cudaFree(in_sign_flip)); - CUDA_CHECK(cudaFree(out_sign_flip_ref)); - CUDA_CHECK(cudaFree(in_recip)); - CUDA_CHECK(cudaFree(in_recip_ref)); - CUDA_CHECK(cudaFree(out_recip)); - CUDA_CHECK(cudaFree(in_smallzero)); - CUDA_CHECK(cudaFree(out_smallzero)); - CUDA_CHECK(cudaFree(out_smallzero_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: MathInputs params; T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, *out_ratio_ref, *in_sign_flip, *out_sign_flip_ref, *in_recip, *in_recip_ref, *out_recip, *in_smallzero, *out_smallzero, *out_smallzero_ref; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index 28222c0697..cc88df0a73 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include "../test_utils.h" namespace raft { @@ -43,11 +44,10 @@ class MatrixTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.n_row * params.n_col; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in1, len); - raft::allocate(in2, len); - raft::allocate(in1_revr, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(in1_revr, len, stream); r.uniform(in1, len, T(-1.0), T(1.0), stream); copy(in1, in2, params.n_row, params.n_col, stream); @@ -55,20 +55,20 @@ class MatrixTest : public ::testing::TestWithParam> { // colReverse(in1_revr, params.n_row, params.n_col); T *outTrunc; - raft::allocate(outTrunc, 6); + raft::allocate(outTrunc, 6, stream); truncZeroOrigin(in1, params.n_row, outTrunc, 3, 2, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - // CUDA_CHECK(cudaFree(in1_revr)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MatrixInputs params; T *in1, *in2, *in1_revr; + cudaStream_t stream; }; const std::vector> inputsf2 = {{0.000001f, 4, 4, 1234ULL}}; @@ -102,17 +102,16 @@ class MatrixCopyRowsTest : public ::testing::Test { protected: MatrixCopyRowsTest() - : allocator(handle.get_device_allocator()), - input(allocator, handle.get_stream(), n_cols * n_rows), - indices(allocator, handle.get_stream(), n_selected), - 
output(allocator, handle.get_stream(), n_cols * n_selected) { + : input(n_cols * n_rows, handle.get_stream()), + indices(n_selected, handle.get_stream()), + output(n_cols * n_selected, handle.get_stream()) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(indices.data(), indices_host, n_selected, stream); // Init input array thrust::counting_iterator first(0); thrust::device_ptr ptr(input.data()); - thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows, + thrust::copy(handle.get_thrust_policy(), first, first + n_cols * n_rows, ptr); } @@ -143,10 +142,9 @@ class MatrixCopyRowsTest : public ::testing::Test { 14, 21, 22, 23, 27, 28, 29}; raft::handle_t handle; cudaStream_t stream; - std::shared_ptr allocator; - raft::mr::device::buffer input; - raft::mr::device::buffer output; - raft::mr::device::buffer indices; + rmm::device_uvector input; + rmm::device_uvector output; + rmm::device_uvector indices; }; using TypeTuple = diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp index 223efdbfe8..fe42cea8b3 100644 --- a/cpp/test/mr/device/buffer.cpp +++ b/cpp/test/mr/device/buffer.cpp @@ -15,22 +15,21 @@ */ #include +#include #include #include -#include +#include #include -#include namespace raft { namespace mr { namespace device { TEST(Raft, DeviceBufferAlloc) { - auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // no allocation at construction - buffer buff(alloc, stream); + rmm::device_uvector buff(0, stream); ASSERT_EQ(0, buff.size()); // explicit allocation after construction buff.resize(20, stream); @@ -39,12 +38,12 @@ TEST(Raft, DeviceBufferAlloc) { buff.resize(10, stream); ASSERT_EQ(10, buff.size()); // explicit deallocation - buff.release(stream); + buff.release(); ASSERT_EQ(0, buff.size()); // use these methods without the explicit stream parameter - buff.resize(20); + buff.resize(20, stream); ASSERT_EQ(20, buff.size()); - buff.resize(10); + buff.resize(10, stream); ASSERT_EQ(10, buff.size()); buff.release(); ASSERT_EQ(0, buff.size()); @@ -62,11 +61,10 @@ TEST(Raft, DeviceBufferZeroResize) { rmm::mr::set_current_device_resource(limit_mr.get()); - auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // no allocation at construction - buffer buff(alloc, stream, 10); + rmm::device_uvector buff(10, stream); ASSERT_EQ(10, buff.size()); // explicit allocation after construction buff.resize(0, stream); @@ -75,7 +73,7 @@ TEST(Raft, DeviceBufferZeroResize) { buff.resize(20, stream); ASSERT_EQ(20, buff.size()); // explicit deallocation - buff.release(stream); + buff.release(); ASSERT_EQ(0, buff.size()); // Now check that there is no memory left. (Used to not be true) diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index 94f81cddb8..781e6d1d3f 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -14,6 +14,12 @@ * limitations under the License. 
*/ +#include + +#include +#include +#include +#include #include "test_utils.h" #include @@ -128,11 +134,18 @@ class MSTTest v = static_cast((csr_d.offsets.size() / sizeof(vertex_t)) - 1); e = static_cast(csr_d.indices.size() / sizeof(edge_t)); - rmm::device_vector mst_src(2 * v - 2, - std::numeric_limits::max()); - rmm::device_vector mst_dst(2 * v - 2, - std::numeric_limits::max()); - rmm::device_vector color(v, 0); + rmm::device_uvector mst_src(2 * v - 2, handle.get_stream()); + rmm::device_uvector mst_dst(2 * v - 2, handle.get_stream()); + rmm::device_uvector color(v, handle.get_stream()); + + CUDA_CHECK( + cudaMemsetAsync(mst_src.data(), std::numeric_limits::max(), + mst_src.size() * sizeof(vertex_t), handle.get_stream())); + CUDA_CHECK( + cudaMemsetAsync(mst_dst.data(), std::numeric_limits::max(), + mst_dst.size() * sizeof(vertex_t), handle.get_stream())); + CUDA_CHECK(cudaMemsetAsync(color.data(), 0, color.size() * sizeof(vertex_t), + handle.get_stream())); vertex_t *color_ptr = thrust::raw_pointer_cast(color.data()); @@ -215,7 +228,6 @@ class MSTTest protected: MSTTestInput mst_input; CSRDevice csr_d; - rmm::device_vector mst_edge; vertex_t v; edge_t e; int iterations; diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu index af10dcab30..c2ec7a340f 100644 --- a/cpp/test/random/rng.cu +++ b/cpp/test/random/rng.cu @@ -85,11 +85,10 @@ class RngTest : public ::testing::TestWithParam> { // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; params = ::testing::TestWithParam>::GetParam(); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); - allocate(data, params.len); - allocate(stats, 2, true); + raft::allocate(data, params.len, stream); + raft::allocate(stats, 2, stream, true); switch (params.type) { case RNG_Normal: r.normal(data, params.len, params.start, params.end, stream); @@ -124,12 +123,12 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(T meanvar[2]) { @@ -182,6 +181,7 @@ class RngTest : public ::testing::TestWithParam> { T *data, *stats; T h_stats[2]; // mean, var int num_sigma; + cudaStream_t stream; }; // The measured mean and standard deviation for each tested distribution are, @@ -383,9 +383,9 @@ TEST(Rng, MeanError) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(data, len); - allocate(mean_result, num_experiments); - allocate(std_result, num_experiments); + raft::allocate(data, len, stream); + raft::allocate(mean_result, num_experiments, stream); + raft::allocate(std_result, num_experiments, stream); for (auto rtype : {GenPhilox, GenKiss99 /*, raft::random::GenTaps */}) { Rng r(seed, rtype); @@ -416,10 +416,8 @@ TEST(Rng, MeanError) { ASSERT_TRUE( (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); } + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(mean_result)); - CUDA_CHECK(cudaFree(std_result)); // std::cout << "mean_res:" << h_mean_result << "\n"; } @@ -432,7 +430,7 @@ class ScaledBernoulliTest : public ::testing::Test { Rng r(42); - allocate(data, len * 
sizeof(T), stream); + raft::allocate(data, len * sizeof(T), stream); r.scaled_bernoulli(data, len, T(0.5), T(scale), stream); } @@ -463,7 +461,7 @@ class BernoulliTest : public ::testing::Test { void SetUp() override { CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(42); - allocate(data, len * sizeof(bool), stream); + raft::allocate(data, len * sizeof(bool), stream); r.bernoulli(data, len, T(0.5), stream); } @@ -515,12 +513,11 @@ class RngNormalTableTest params = ::testing::TestWithParam>::GetParam(); int len = params.rows * params.cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); - allocate(data, len); - allocate(stats, 2, true); - allocate(mu_vec, params.cols); + raft::allocate(data, len, stream); + raft::allocate(stats, 2, stream, true); + raft::allocate(mu_vec, params.cols, stream); r.fill(mu_vec, params.cols, params.mu, stream); T* sigma_vec = nullptr; r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec, @@ -532,13 +529,12 @@ class RngNormalTableTest CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= len; h_stats[1] = (h_stats[1] / len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); - CUDA_CHECK(cudaFree(mu_vec)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(T meanvar[2]) { @@ -551,6 +547,7 @@ class RngNormalTableTest T *data, *stats, *mu_vec; T h_stats[2]; // mean, var int num_sigma; + cudaStream_t stream; }; typedef RngNormalTableTest RngNormalTableTestF; diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu index 92f12206e8..a98619e5b4 100644 --- a/cpp/test/random/rng_int.cu +++ b/cpp/test/random/rng_int.cu @@ -70,10 +70,9 @@ class RngTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); Rng r(params.seed, params.gtype); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(data, params.len); - allocate(stats, 2, true); + raft::allocate(data, params.len, stream); + raft::allocate(stats, 2, stream, true); switch (params.type) { case RNG_Uniform: r.uniformInt(data, params.len, params.start, params.end, stream); @@ -87,12 +86,12 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(float meanvar[2]) { @@ -110,6 +109,7 @@ class RngTest : public ::testing::TestWithParam> { T *data; float *stats; float h_stats[2]; // mean, var + cudaStream_t stream; }; typedef RngTest RngTestU32; diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu index d7e52a8958..cf60f46afe 100644 --- a/cpp/test/random/sample_without_replacement.cu +++ b/cpp/test/random/sample_without_replacement.cu @@ -50,10 +50,10 @@ class SWoRTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); - allocate(in, params.len); - allocate(wts, params.len); - allocate(out, params.sampledLen); - allocate(outIdx, params.sampledLen); + raft::allocate(in, params.len, stream); + 
raft::allocate(wts, params.len, stream); + raft::allocate(out, params.sampledLen, stream); + raft::allocate(outIdx, params.sampledLen, stream); h_outIdx.resize(params.sampledLen); r.uniform(in, params.len, T(-1.0), T(1.0), stream); r.uniform(wts, params.len, T(1.0), T(2.0), stream); @@ -67,12 +67,8 @@ class SWoRTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(wts)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(outIdx)); } protected: diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu index 713708d4cd..8429a46941 100644 --- a/cpp/test/sparse/add.cu +++ b/cpp/test/sparse/add.cu @@ -56,27 +56,24 @@ class CSRAddTest cudaStreamCreate(&stream); - raft::allocate(ind_a, n_rows); - raft::allocate(ind_ptr_a, nnz_a); - raft::allocate(values_a, nnz_a); + raft::allocate(ind_a, n_rows, stream); + raft::allocate(ind_ptr_a, nnz_a, stream); + raft::allocate(values_a, nnz_a, stream); - raft::allocate(ind_b, n_rows); - raft::allocate(ind_ptr_b, nnz_b); - raft::allocate(values_b, nnz_b); + raft::allocate(ind_b, n_rows, stream); + raft::allocate(ind_ptr_b, nnz_b, stream); + raft::allocate(values_b, nnz_b, stream); - raft::allocate(ind_verify, n_rows); - raft::allocate(ind_ptr_verify, nnz_result); - raft::allocate(values_verify, nnz_result); + raft::allocate(ind_verify, n_rows, stream); + raft::allocate(ind_ptr_verify, nnz_result, stream); + raft::allocate(values_verify, nnz_result, stream); - raft::allocate(ind_result, n_rows); - raft::allocate(ind_ptr_result, nnz_result); - raft::allocate(values_result, nnz_result); + raft::allocate(ind_result, n_rows, stream); + raft::allocate(ind_ptr_result, nnz_result, stream); + raft::allocate(values_result, nnz_result, stream); } void Run() { - std::shared_ptr alloc( - new raft::mr::device::default_allocator); - raft::update_device(ind_a, params.matrix_a.row_ind.data(), n_rows, stream); raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a, stream); @@ -96,7 +93,7 @@ class CSRAddTest Index_ nnz = linalg::csr_add_calc_inds( ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b, - n_rows, ind_result, alloc, stream); + n_rows, ind_result, stream); ASSERT_TRUE(nnz == nnz_result); ASSERT_TRUE(raft::devArrMatch(ind_verify, ind_result, n_rows, @@ -113,18 +110,8 @@ class CSRAddTest } void TearDown() override { - CUDA_CHECK(cudaFree(ind_a)); - CUDA_CHECK(cudaFree(ind_b)); - CUDA_CHECK(cudaFree(ind_result)); - CUDA_CHECK(cudaFree(ind_ptr_a)); - CUDA_CHECK(cudaFree(ind_ptr_b)); - CUDA_CHECK(cudaFree(ind_ptr_verify)); - CUDA_CHECK(cudaFree(ind_ptr_result)); - CUDA_CHECK(cudaFree(values_a)); - CUDA_CHECK(cudaFree(values_b)); - CUDA_CHECK(cudaFree(values_verify)); - CUDA_CHECK(cudaFree(values_result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index d98f9de9c3..dd6ba1479e 100644 --- a/cpp/test/sparse/connect_components.cu +++ b/cpp/test/sparse/connect_components.cu @@ -28,7 +28,6 @@ #include #include -#include #include #include #include @@ -57,14 +56,12 @@ class ConnectComponentsTest : public ::testing::TestWithParam< void basicTest() { raft::handle_t handle; - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); params = ::testing::TestWithParam< 
ConnectComponentsInputs>::GetParam(); - raft::sparse::COO out_edges( - handle.get_device_allocator(), handle.get_stream()); + raft::sparse::COO out_edges(handle.get_stream()); rmm::device_uvector data(params.n_row * params.n_col, handle.get_stream()); @@ -77,7 +74,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< /** * 1. Construct knn graph */ - raft::sparse::COO knn_graph_coo(d_alloc, stream); + raft::sparse::COO knn_graph_coo(stream); raft::sparse::selection::knn_graph( handle, data.data(), params.n_row, params.n_col, @@ -85,7 +82,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), - params.n_row + 1, d_alloc, stream); + params.n_row + 1, stream); /** * 2. Construct MST, sorted by weights @@ -112,7 +109,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< raft::sparse::convert::sorted_coo_to_csr(out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, - d_alloc, stream); + stream); auto output_mst = raft::mst::mst( handle, indptr2.data(), out_edges.cols(), out_edges.vals(), params.n_row, diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu index ea69ecfc53..4f9c00c7ab 100644 --- a/cpp/test/sparse/convert_coo.cu +++ b/cpp/test/sparse/convert_coo.cu @@ -43,9 +43,9 @@ class CSRtoCOOTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); cudaStreamCreate(&stream); - raft::allocate(ex_scan, params.ex_scan.size()); - raft::allocate(verify, params.verify.size()); - raft::allocate(result, params.verify.size(), true); + raft::allocate(ex_scan, params.ex_scan.size(), stream); + raft::allocate(verify, params.verify.size(), stream); + raft::allocate(result, params.verify.size(), stream, true); } void Run() { @@ -62,9 +62,7 @@ class CSRtoCOOTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index 553ef2ddee..465aad4e7f 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -19,7 +19,6 @@ #include #include "../test_utils.h" -#include #include #include @@ -61,8 +60,6 @@ typedef SparseConvertCSRTest SortedCOOToCSR; TEST_P(SortedCOOToCSR, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); int nnz = 8; @@ -71,14 +68,14 @@ TEST_P(SortedCOOToCSR, Result) { int *in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; int *exp_h = new int[4]{0, 2, 4, 6}; - raft::allocate(in, nnz, true); - raft::allocate(exp, 4, true); - raft::allocate(out, 4, true); + raft::allocate(in, nnz, stream, true); + raft::allocate(exp, 4, stream, true); + raft::allocate(out, 4, stream, true); raft::update_device(in, in_h, nnz, stream); raft::update_device(exp, exp_h, 4, stream); - convert::sorted_coo_to_csr(in, nnz, out, 4, alloc, stream); + convert::sorted_coo_to_csr(in, nnz, out, 4, stream); ASSERT_TRUE(raft::devArrMatch(out, exp, 4, raft::Compare())); @@ -115,10 +112,10 @@ class CSRAdjGraphTest cudaStreamCreate(&stream); nnz = params.verify.size(); - raft::allocate(row_ind, params.n_rows); - raft::allocate(adj, params.n_rows * params.n_cols); - raft::allocate(result, nnz, true); - raft::allocate(verify, nnz); + raft::allocate(row_ind, params.n_rows, stream); + 
raft::allocate(adj, params.n_rows * params.n_cols, stream); + raft::allocate(result, nnz, stream, true); + raft::allocate(verify, nnz, stream); } void Run() { @@ -135,11 +132,8 @@ class CSRAdjGraphTest } void TearDown() override { - CUDA_CHECK(cudaFree(row_ind)); - CUDA_CHECK(cudaFree(adj)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu index 625772a842..00e6899cb2 100644 --- a/cpp/test/sparse/csr_row_slice.cu +++ b/cpp/test/sparse/csr_row_slice.cu @@ -19,8 +19,6 @@ #include #include -#include -#include #include @@ -61,9 +59,9 @@ class CSRRowSliceTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), stream); + raft::allocate(indices, indices_h.size(), stream); + raft::allocate(data, data_h.size(), stream); update_device(indptr, indptr_h.data(), indptr_h.size(), stream); update_device(indices, indices_h.data(), indices_h.size(), stream); @@ -73,9 +71,9 @@ class CSRRowSliceTest std::vector out_indices_ref_h = params.out_indices_ref_h; std::vector out_data_ref_h = params.out_data_ref_h; - allocate(out_indptr_ref, out_indptr_ref_h.size()); - allocate(out_indices_ref, out_indices_ref_h.size()); - allocate(out_data_ref, out_data_ref_h.size()); + raft::allocate(out_indptr_ref, out_indptr_ref_h.size(), stream); + raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream); + raft::allocate(out_data_ref, out_data_ref_h.size(), stream); update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); @@ -84,16 +82,14 @@ class CSRRowSliceTest update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream); - allocate(out_indptr, out_indptr_ref_h.size()); - allocate(out_indices, out_indices_ref_h.size()); - allocate(out_data, out_data_ref_h.size()); + raft::allocate(out_indptr, out_indptr_ref_h.size(), stream); + raft::allocate(out_indices, out_indices_ref_h.size(), stream); + raft::allocate(out_data, out_data_ref_h.size(), stream); } void SetUp() override { params = ::testing::TestWithParam< CSRRowSliceInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); make_data(); @@ -113,16 +109,8 @@ class CSRRowSliceTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indptr)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_data)); - CUDA_CHECK(cudaFree(out_indptr_ref)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_data_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu index 5535df4fe3..7f6b7dad07 100644 --- a/cpp/test/sparse/csr_to_dense.cu +++ b/cpp/test/sparse/csr_to_dense.cu @@ -16,8 +16,6 @@ #include #include -#include -#include #include #include @@ -57,9 +55,9 @@ class CSRToDenseTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, 
indptr_h.size(), stream); + raft::allocate(indices, indices_h.size(), stream); + raft::allocate(data, data_h.size(), stream); update_device(indptr, indptr_h.data(), indptr_h.size(), stream); update_device(indices, indices_h.data(), indices_h.size(), stream); @@ -67,18 +65,16 @@ class CSRToDenseTest std::vector out_ref_h = params.out_ref_h; - allocate(out_ref, out_ref_h.size()); + raft::allocate(out_ref, out_ref_h.size(), stream); update_device(out_ref, out_ref_h.data(), out_ref_h.size(), stream); - allocate(out, out_ref_h.size()); + raft::allocate(out, out_ref_h.size(), stream); } void SetUp() override { params = ::testing::TestWithParam< CSRToDenseInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); @@ -92,12 +88,8 @@ class CSRToDenseTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu index c257d6eb3c..e50a9d94a9 100644 --- a/cpp/test/sparse/csr_transpose.cu +++ b/cpp/test/sparse/csr_transpose.cu @@ -20,8 +20,6 @@ #include #include -#include -#include #include @@ -63,9 +61,9 @@ class CSRTransposeTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), stream); + raft::allocate(indices, indices_h.size(), stream); + raft::allocate(data, data_h.size(), stream); update_device(indptr, indptr_h.data(), indptr_h.size(), stream); update_device(indices, indices_h.data(), indices_h.size(), stream); @@ -75,9 +73,9 @@ class CSRTransposeTest std::vector out_indices_ref_h = params.out_indices_ref_h; std::vector out_data_ref_h = params.out_data_ref_h; - allocate(out_indptr_ref, out_indptr_ref_h.size()); - allocate(out_indices_ref, out_indices_ref_h.size()); - allocate(out_data_ref, out_data_ref_h.size()); + raft::allocate(out_indptr_ref, out_indptr_ref_h.size(), stream); + raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream); + raft::allocate(out_data_ref, out_data_ref_h.size(), stream); update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); @@ -86,16 +84,14 @@ class CSRTransposeTest update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream); - allocate(out_indptr, out_indptr_ref_h.size()); - allocate(out_indices, out_indices_ref_h.size()); - allocate(out_data, out_data_ref_h.size()); + raft::allocate(out_indptr, out_indptr_ref_h.size(), stream); + raft::allocate(out_indices, out_indices_ref_h.size(), stream); + raft::allocate(out_data, out_data_ref_h.size(), stream); } void SetUp() override { params = ::testing::TestWithParam< CSRTransposeInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); @@ -103,23 +99,15 @@ class CSRTransposeTest raft::sparse::linalg::csr_transpose( handle, indptr, indices, data, out_indptr, out_indices, out_data, - params.nrows, params.ncols, params.nnz, alloc, stream); + params.nrows, params.ncols, params.nnz, stream); 
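Across these test diffs the same conversion repeats: device buffers are allocated against the test's CUDA stream and released in a single call before the stream is destroyed. A minimal sketch of that fixture pattern, assuming the `raft::allocate`/`raft::deallocate_all` test utilities these patches switch to, with the header location and the `ExampleTest` fixture name used purely for illustration:

```cpp
#include <raft/cudart_utils.h>  // assumed location of raft::allocate / raft::deallocate_all / CUDA_CHECK

#include <gtest/gtest.h>

class ExampleTest : public ::testing::Test {
 protected:
  void SetUp() override {
    CUDA_CHECK(cudaStreamCreate(&stream));
    raft::allocate(data, len, stream);          // stream-ordered, uninitialized
    raft::allocate(result, len, stream, true);  // trailing flag zero-initializes
  }

  void TearDown() override {
    raft::deallocate_all(stream);  // frees every allocation made on this stream
    CUDA_CHECK(cudaStreamDestroy(stream));
  }

  static constexpr int len = 1024;
  float* data;
  float* result;
  cudaStream_t stream;
};
```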
CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indptr)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_data)); - CUDA_CHECK(cudaFree(out_indptr_ref)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_data_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu index 5d687ad92b..f8a469af45 100644 --- a/cpp/test/sparse/degree.cu +++ b/cpp/test/sparse/degree.cu @@ -48,25 +48,27 @@ const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseDegreeTests COODegree; TEST_P(COODegree, Result) { + cudaStream_t stream; + cudaStreamCreate(&stream); int *in_rows, *verify, *results; int in_rows_h[5] = {0, 0, 1, 2, 2}; int verify_h[5] = {2, 1, 2, 0, 0}; - raft::allocate(in_rows, 5); - raft::allocate(verify, 5, true); - raft::allocate(results, 5, true); + raft::allocate(in_rows, 5, stream); + raft::allocate(verify, 5, stream, true); + raft::allocate(results, 5, stream, true); - raft::update_device(in_rows, *&in_rows_h, 5, 0); - raft::update_device(verify, *&verify_h, 5, 0); + raft::update_device(in_rows, *&in_rows_h, 5, stream); + raft::update_device(verify, *&verify_h, 5, stream); - linalg::coo_degree<32>(in_rows, 5, results, 0); + linalg::coo_degree<32>(in_rows, 5, results, stream); cudaDeviceSynchronize(); ASSERT_TRUE(raft::devArrMatch(verify, results, 5, raft::Compare())); - CUDA_CHECK(cudaFree(in_rows)); - CUDA_CHECK(cudaFree(verify)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } typedef SparseDegreeTests COODegreeNonzero; @@ -81,23 +83,21 @@ TEST_P(COODegreeNonzero, Result) { float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0}; int verify_h[5] = {1, 0, 2, 0, 0}; - raft::allocate(in_rows, 5); - raft::allocate(verify, 5, true); - raft::allocate(results, 5, true); - raft::allocate(in_vals, 5, true); + raft::allocate(in_rows, 5, stream); + raft::allocate(verify, 5, stream, true); + raft::allocate(results, 5, stream, true); + raft::allocate(in_vals, 5, stream, true); - raft::update_device(in_rows, *&in_rows_h, 5, 0); - raft::update_device(verify, *&verify_h, 5, 0); - raft::update_device(in_vals, *&in_vals_h, 5, 0); + raft::update_device(in_rows, *&in_rows_h, 5, stream); + raft::update_device(verify, *&verify_h, 5, stream); + raft::update_device(in_vals, *&in_vals_h, 5, stream); linalg::coo_degree_nz<32, float>(in_rows, in_vals, 5, results, stream); cudaDeviceSynchronize(); ASSERT_TRUE(raft::devArrMatch(verify, results, 5, raft::Compare())); - CUDA_CHECK(cudaFree(in_rows)); - CUDA_CHECK(cudaFree(verify)); - + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index a83b93f83f..563dcf6f15 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include @@ -94,10 +94,9 @@ class SparseDistanceCOOSPMVTest template void compute_dist(reduce_f reduce_func, accum_f accum_func, write_f write_func, bool rev = true) { - raft::mr::device::buffer coo_rows( - dist_config.handle.get_device_allocator(), - dist_config.handle.get_stream(), - max(dist_config.b_nnz, dist_config.a_nnz)); + rmm::device_uvector coo_rows( + 
max(dist_config.b_nnz, dist_config.a_nnz), + dist_config.handle.get_stream()); raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows, coo_rows.data(), dist_config.b_nnz, @@ -161,9 +160,9 @@ class SparseDistanceCOOSPMVTest std::vector indices_h = params.input_configuration.indices_h; std::vector data_h = params.input_configuration.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), handle.get_stream()); + raft::allocate(indices, indices_h.size(), handle.get_stream()); + raft::allocate(data, data_h.size(), handle.get_stream()); update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); @@ -174,7 +173,8 @@ class SparseDistanceCOOSPMVTest std::vector out_dists_ref_h = params.input_configuration.out_dists_ref_h; - allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1)); + raft::allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1), + handle.get_stream()); update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream()); @@ -201,21 +201,14 @@ class SparseDistanceCOOSPMVTest int out_size = dist_config.a_nrows * dist_config.b_nrows; - allocate(out_dists, out_size); + raft::allocate(out_dists, out_size, handle.get_stream()); run_spmv(); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index 0589637061..4b531992f0 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include @@ -82,21 +81,14 @@ class SparseDistanceTest int out_size = dist_config.a_nrows * dist_config.b_nrows; - allocate(out_dists, out_size); + raft::allocate(out_dists, out_size, handle.get_stream()); pairwiseDistance(out_dists, dist_config, params.metric, params.metric_arg); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, @@ -110,9 +102,9 @@ class SparseDistanceTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), handle.get_stream()); + raft::allocate(indices, indices_h.size(), handle.get_stream()); + raft::allocate(data, data_h.size(), handle.get_stream()); update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); @@ -122,7 +114,8 @@ class SparseDistanceTest std::vector out_dists_ref_h = params.out_dists_ref_h; - allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1)); + raft::allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1), + 
handle.get_stream()); update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), dist_config.handle.get_stream()); diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu index f7954f899f..4634e5fc0e 100644 --- a/cpp/test/sparse/filter.cu +++ b/cpp/test/sparse/filter.cu @@ -20,7 +20,6 @@ #include "../test_utils.h" #include -#include #include #include @@ -53,13 +52,11 @@ typedef SparseFilterTests COORemoveZeros; TEST_P(COORemoveZeros, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); params = ::testing::TestWithParam>::GetParam(); float *in_h_vals = new float[params.nnz]; - COO in(alloc, stream, params.nnz, 5, 5); + COO in(stream, params.nnz, 5, 5); raft::random::Rng r(params.seed); r.uniform(in.vals(), params.nnz, float(-1.0), float(1.0), stream); @@ -82,7 +79,7 @@ TEST_P(COORemoveZeros, Result) { raft::update_device(in.cols(), in_h_cols, params.nnz, stream); raft::update_device(in.vals(), in_h_vals, params.nnz, stream); - op::coo_sort(&in, alloc, stream); + op::coo_sort(&in, stream); int out_rows_ref_h[2] = {0, 3}; int out_cols_ref_h[2] = {4, 1}; @@ -91,14 +88,14 @@ TEST_P(COORemoveZeros, Result) { out_vals_ref_h[0] = in_h_vals[4]; out_vals_ref_h[1] = in_h_vals[1]; - COO out_ref(alloc, stream, 2, 5, 5); - COO out(alloc, stream); + COO out_ref(stream, 2, 5, 5); + COO out(stream); raft::update_device(out_ref.rows(), *&out_rows_ref_h, 2, stream); raft::update_device(out_ref.cols(), *&out_cols_ref_h, 2, stream); raft::update_device(out_ref.vals(), out_vals_ref_h, 2, stream); - op::coo_remove_zeros<32, float>(&in, &out, alloc, stream); + op::coo_remove_zeros<32, float>(&in, &out, stream); ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, raft::Compare())); diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index 8c3bf36318..22f97559b1 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -24,8 +24,6 @@ #include #include -#include -#include namespace raft { namespace sparse { @@ -82,15 +80,7 @@ class SparseKNNTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, @@ -105,9 +95,9 @@ class SparseKNNTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), handle.get_stream()); + raft::allocate(indices, indices_h.size(), handle.get_stream()); + raft::allocate(data, data_h.size(), handle.get_stream()); update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); @@ -118,16 +108,17 @@ class SparseKNNTest std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - allocate(out_indices_ref, out_indices_ref_h.size()); - allocate(out_dists_ref, out_dists_ref_h.size()); + raft::allocate(out_indices_ref, out_indices_ref_h.size(), + handle.get_stream()); + raft::allocate(out_dists_ref, out_dists_ref_h.size(), handle.get_stream()); update_device(out_indices_ref, out_indices_ref_h.data(), 
out_indices_ref_h.size(), handle.get_stream()); update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream()); - allocate(out_dists, n_rows * k); - allocate(out_indices, n_rows * k); + raft::allocate(out_dists, n_rows * k, handle.get_stream()); + raft::allocate(out_indices, n_rows * k, handle.get_stream()); } raft::handle_t handle; diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index ec41b32374..e259eafa70 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "../test_utils.h" @@ -64,12 +65,11 @@ class KNNGraphTest raft::handle_t handle; - auto alloc = handle.get_device_allocator(); stream = handle.get_stream(); - out = new raft::sparse::COO(alloc, stream); + out = new raft::sparse::COO(stream); - allocate(X, params.X.size()); + raft::allocate(X, params.X.size(), stream); update_device(X, params.X.data(), params.X.size(), stream); @@ -77,9 +77,8 @@ class KNNGraphTest handle, X, params.m, params.n, raft::distance::DistanceType::L2Unexpanded, *out); - rmm::device_uvector sum(1, stream); - - CUDA_CHECK(cudaMemsetAsync(sum.data(), 0, 1 * sizeof(value_idx), stream)); + rmm::device_scalar sum(stream); + sum.set_value_to_zero_async(stream); /** * Assert the knn graph is symmetric @@ -87,12 +86,13 @@ class KNNGraphTest assert_symmetry<<nnz, 256), 256, 0, stream>>>( out->rows(), out->cols(), out->vals(), out->nnz, sum.data()); - raft::update_host(&sum_h, sum.data(), 1, stream); + sum_h = sum.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(X)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); delete out; } diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index a157a17e30..3bd144ef54 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -19,8 +19,6 @@ #include #include #include -#include -#include #include #include @@ -108,18 +106,16 @@ __global__ void computeTheNumerator(const T* firstClusterArray, * @param firstClusterArray: the array of classes of type T * @param secondClusterArray: the array of classes of type T * @param size: the size of the data points of type uint64_t -* @param allocator: object that takes care of temporary device memory allocation of type std::shared_ptr * @param stream: the cudaStream object */ template -double compute_rand_index( - T* firstClusterArray, T* secondClusterArray, uint64_t size, - std::shared_ptr allocator, cudaStream_t stream) { +double compute_rand_index(T* firstClusterArray, T* secondClusterArray, + uint64_t size, cudaStream_t stream) { //rand index for size less than 2 is not defined ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); //allocating and initializing memory for a and b in the GPU - raft::mr::device::buffer arr_buf(allocator, stream, 2); + rmm::device_uvector arr_buf(2, stream); CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); //kernel configuration @@ -159,30 +155,27 @@ template class LinkageTest : public ::testing::TestWithParam> { protected: void basicTest() { - raft::handle_t handle; + CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); - rmm::device_uvector data(params.n_row * params.n_col, - handle.get_stream()); + rmm::device_uvector data(params.n_row * params.n_col, stream); // Allocate result labels and expected labels on device - raft::allocate(labels, params.n_row); - 
raft::allocate(labels_ref, params.n_row); + raft::allocate(labels, params.n_row, stream); + raft::allocate(labels_ref, params.n_row, stream); - raft::copy(data.data(), params.data.data(), data.size(), - handle.get_stream()); - raft::copy(labels_ref, params.expected_labels.data(), params.n_row, - handle.get_stream()); + raft::copy(data.data(), params.data.data(), data.size(), stream); + raft::copy(labels_ref, params.expected_labels.data(), params.n_row, stream); raft::hierarchy::linkage_output out_arrs; out_arrs.labels = labels; - rmm::device_uvector out_children(params.n_row * 2, - handle.get_stream()); + rmm::device_uvector out_children(params.n_row * 2, stream); out_arrs.children = out_children.data(); + raft::handle_t handle; raft::hierarchy::single_linkage< IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>( handle, data.data(), params.n_row, params.n_col, @@ -191,23 +184,21 @@ class LinkageTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - score = - compute_rand_index(labels, labels_ref, params.n_row, - handle.get_device_allocator(), handle.get_stream()); + score = compute_rand_index(labels, labels_ref, params.n_row, stream); } void SetUp() override { basicTest(); } void TearDown() override { - CUDA_CHECK(cudaFree(labels)); - CUDA_CHECK(cudaFree(labels_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: LinkageInputs params; IdxT *labels, *labels_ref; - double score; + cudaStream_t stream; }; const std::vector> linkage_inputsf2 = { diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index 7adbbf8b9a..d69dd15c57 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -47,10 +47,10 @@ class CSRRowNormalizeTest CSRRowNormalizeInputs>::GetParam(); cudaStreamCreate(&stream); - raft::allocate(in_vals, params.in_vals.size()); - raft::allocate(verify, params.verify.size()); - raft::allocate(ex_scan, params.ex_scan.size()); - raft::allocate(result, params.verify.size(), true); + raft::allocate(in_vals, params.in_vals.size(), stream); + raft::allocate(verify, params.verify.size(), stream); + raft::allocate(ex_scan, params.ex_scan.size(), stream); + raft::allocate(result, params.verify.size(), stream, true); } void Run() { @@ -77,11 +77,8 @@ class CSRRowNormalizeTest } void TearDown() override { - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(in_vals)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu index 50b5dc5993..8ff4a600bc 100644 --- a/cpp/test/sparse/reduce.cu +++ b/cpp/test/sparse/reduce.cu @@ -53,7 +53,6 @@ class SparseReduceTest void Run() { raft::handle_t handle; - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); rmm::device_uvector in_rows(params.in_rows.size(), stream); @@ -76,7 +75,7 @@ class SparseReduceTest raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream); - raft::sparse::COO out(d_alloc, stream); + raft::sparse::COO out(stream); raft::sparse::op::max_duplicates(handle, out, in_rows.data(), in_cols.data(), in_vals.data(), params.in_rows.size(), params.m, params.n); diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu index b64fa25883..805a3d85da 100644 --- a/cpp/test/sparse/row_op.cu +++ b/cpp/test/sparse/row_op.cu @@ -59,9 +59,9 @@ class CSRRowOpTest n_rows = 
params.ex_scan.size(); nnz = params.verify.size(); - raft::allocate(verify, nnz); - raft::allocate(ex_scan, n_rows); - raft::allocate(result, nnz, true); + raft::allocate(verify, nnz, stream); + raft::allocate(ex_scan, n_rows, stream); + raft::allocate(result, nnz, stream, true); } void Run() { @@ -75,10 +75,8 @@ class CSRRowOpTest } void TearDown() override { - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu index 46f2f6a844..256ecfdfb7 100644 --- a/cpp/test/sparse/selection.cu +++ b/cpp/test/sparse/selection.cu @@ -57,32 +57,30 @@ class SparseSelectionTest void make_data() { std::vector dists_h = params.dists_h; - allocate(dists, n_rows * n_cols); + raft::allocate(dists, n_rows * n_cols, stream); update_device(dists, dists_h.data(), dists_h.size(), stream); - allocate(inds, n_rows * n_cols); + raft::allocate(inds, n_rows * n_cols, stream); iota_fill(inds, n_rows, n_cols, stream); std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - allocate(out_indices_ref, out_indices_ref_h.size()); - allocate(out_dists_ref, out_dists_ref_h.size()); + raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream); + raft::allocate(out_dists_ref, out_dists_ref_h.size(), stream); update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream); update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), stream); - allocate(out_dists, n_rows * k); - allocate(out_indices, n_rows * k); + raft::allocate(out_dists, n_rows * k, stream); + raft::allocate(out_indices, n_rows * k, stream); } void SetUp() override { params = ::testing::TestWithParam< SparseSelectionInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); n_rows = params.n_rows; @@ -99,15 +97,7 @@ class SparseSelectionTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - - CUDA_CHECK(cudaFree(dists)); - CUDA_CHECK(cudaFree(inds)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_dists_ref)); - + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu index b9a8b849eb..e73a8a547b 100644 --- a/cpp/test/sparse/sort.cu +++ b/cpp/test/sparse/sort.cu @@ -20,7 +20,6 @@ #include "../test_utils.h" #include -#include #include @@ -55,10 +54,8 @@ TEST_P(COOSort, Result) { raft::random::Rng r(params.seed); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); - raft::allocate(in_vals, params.nnz); + raft::allocate(in_vals, params.nnz, stream); r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream); int *in_rows_h = (int *)malloc(params.nnz * sizeof(int)); @@ -71,16 +68,16 @@ TEST_P(COOSort, Result) { in_cols_h[i] = i; } - raft::allocate(in_rows, params.nnz); - raft::allocate(in_cols, params.nnz); - raft::allocate(verify, params.nnz); + raft::allocate(in_rows, params.nnz, stream); + raft::allocate(in_cols, params.nnz, stream); + raft::allocate(verify, params.nnz, stream); raft::update_device(in_rows, in_rows_h, params.nnz, stream); raft::update_device(in_cols, in_cols_h, params.nnz, 
stream); raft::update_device(verify, verify_h, params.nnz, stream); - op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc, + op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, stream); ASSERT_TRUE( diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index d104028d2b..35233dc473 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -17,11 +17,13 @@ #include #include #include -#include "../test_utils.h" - #include #include #include +#include +#include + +#include "../test_utils.h" #include @@ -63,9 +65,9 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), stream); + raft::allocate(indices, indices_h.size(), stream); + raft::allocate(data, data_h.size(), stream); update_device(indptr, indptr_h.data(), indptr_h.size(), stream); update_device(indices, indices_h.data(), indices_h.size(), stream); @@ -78,7 +80,6 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< raft::handle_t handle; - auto alloc = handle.get_device_allocator(); stream = handle.get_stream(); make_data(); @@ -87,23 +88,22 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< value_idx n = params.n_cols; value_idx nnz = params.indices_h.size(); - raft::mr::device::buffer coo_rows(alloc, stream, nnz); + rmm::device_uvector coo_rows(nnz, stream); raft::sparse::convert::csr_to_coo(indptr, m, coo_rows.data(), nnz, stream); - raft::sparse::COO out(alloc, stream); + raft::sparse::COO out(stream); raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices, data, m, n, coo_rows.size(), out); - raft::mr::device::buffer sum(alloc, stream, 1); - - CUDA_CHECK(cudaMemsetAsync(sum.data(), 0, 1 * sizeof(value_idx), stream)); + rmm::device_scalar sum(stream); + sum.set_value_to_zero_async(stream); assert_symmetry<<>>( out.rows(), out.cols(), out.vals(), out.nnz, sum.data()); - raft::update_host(&sum_h, sum.data(), 1, stream); + sum_h = sum.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -148,9 +148,6 @@ TEST_P(COOSymmetrize, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); - int nnz = 8; int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; @@ -164,19 +161,19 @@ TEST_P(COOSymmetrize, Result) { float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; - COO in(alloc, stream, nnz, 4, 4); + COO in(stream, nnz, 4, 4); raft::update_device(in.rows(), *&in_rows_h, nnz, stream); raft::update_device(in.cols(), *&in_cols_h, nnz, stream); raft::update_device(in.vals(), *&in_vals_h, nnz, stream); - COO out(alloc, stream); + COO out(stream); linalg::coo_symmetrize<32, float>( &in, &out, [] __device__(int row, int col, float val, float trans) { return val + trans; }, - alloc, stream); + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); std::cout << out << std::endl; diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index def1f1685b..122d7f2d6a 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -18,7 +18,6 @@ #include #include #include -#include #include #include "../test_utils.h" @@ -30,18 +29,18 @@ template class HaversineKNNTest : public ::testing::Test { protected: void basicTest() { - 
auto alloc = std::make_shared(); + CUDA_CHECK(cudaStreamCreate(&stream)); // Allocate input - raft::allocate(d_train_inputs, n * d); + raft::allocate(d_train_inputs, n * d, stream); // Allocate reference arrays - raft::allocate(d_ref_I, n * n); - raft::allocate(d_ref_D, n * n); + raft::allocate(d_ref_I, n * n, stream); + raft::allocate(d_ref_D, n * n, stream); // Allocate predicted arrays - raft::allocate(d_pred_I, n * n); - raft::allocate(d_pred_D, n * n); + raft::allocate(d_pred_I, n * n, stream); + raft::allocate(d_pred_D, n * n, stream); // make testdata on host std::vector h_train_inputs = { @@ -50,7 +49,7 @@ class HaversineKNNTest : public ::testing::Test { 0.53154002, -1.47049808, 0.72891737, -1.54095137}; h_train_inputs.resize(n); - raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, 0); + raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, stream); std::vector h_res_D = { 0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595, @@ -60,34 +59,28 @@ class HaversineKNNTest : public ::testing::Test { 0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737, 0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665}; h_res_D.resize(n * n); - raft::update_device(d_ref_D, h_res_D.data(), n * n, 0); + raft::update_device(d_ref_D, h_res_D.data(), n * n, stream); std::vector h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, 2, 0, 5, 4, 3, 1, 3, 4, 5, 2, 0, 1, 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1}; h_res_I.resize(n * n); - raft::update_device(d_ref_I, h_res_I.data(), n * n, 0); + raft::update_device(d_ref_I, h_res_I.data(), n * n, stream); std::vector input_vec = {d_train_inputs}; std::vector sizes_vec = {n}; - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - raft::spatial::knn::detail::haversine_knn( d_pred_I, d_pred_D, d_train_inputs, d_train_inputs, n, n, k, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void SetUp() override { basicTest(); } void TearDown() override { - CUDA_CHECK(cudaFree(d_train_inputs)); - CUDA_CHECK(cudaFree(d_pred_I)); - CUDA_CHECK(cudaFree(d_pred_D)); - CUDA_CHECK(cudaFree(d_ref_I)); - CUDA_CHECK(cudaFree(d_ref_D)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: @@ -103,6 +96,8 @@ class HaversineKNNTest : public ::testing::Test { value_idx *d_ref_I; value_t *d_ref_D; + + cudaStream_t stream; }; typedef HaversineKNNTest HaversineKNNTestF; diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index de6251d32d..e4d05920c0 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -68,8 +68,8 @@ class KNNTest : public ::testing::TestWithParam { auto stream = handle_.get_stream(); - raft::allocate(actual_labels_, rows_ * k_, true); - raft::allocate(expected_labels_, rows_ * k_, true); + raft::allocate(actual_labels_, rows_ * k_, stream, true); + raft::allocate(expected_labels_, rows_ * k_, stream, true); std::vector input_vec; std::vector sizes_vec; @@ -104,6 +104,8 @@ class KNNTest : public ::testing::TestWithParam { cols_ = params_.input[0].size(); k_ = params_.k; + cudaStream_t stream = handle_.get_stream(); + std::vector row_major_input; for (std::size_t i = 0; i < params_.input.size(); ++i) { for (std::size_t j = 0; j < params_.input[i].size(); ++j) { @@ -111,31 +113,27 @@ class KNNTest : public ::testing::TestWithParam { } } rmm::device_buffer input_d = rmm::device_buffer( - row_major_input.data(), row_major_input.size() * sizeof(float), - handle_.get_stream()); + row_major_input.data(), 
row_major_input.size() * sizeof(float), stream); float *input_ptr = static_cast(input_d.data()); rmm::device_buffer labels_d = rmm::device_buffer( - params_.labels.data(), params_.labels.size() * sizeof(int), - handle_.get_stream()); + params_.labels.data(), params_.labels.size() * sizeof(int), stream); int *labels_ptr = static_cast(labels_d.data()); - raft::allocate(input_, rows_ * cols_, true); - raft::allocate(search_data_, rows_ * cols_, true); - raft::allocate(indices_, rows_ * k_, true); - raft::allocate(distances_, rows_ * k_, true); - raft::allocate(search_labels_, rows_, true); + raft::allocate(input_, rows_ * cols_, stream, true); + raft::allocate(search_data_, rows_ * cols_, stream, true); + raft::allocate(indices_, rows_ * k_, stream, true); + raft::allocate(distances_, rows_ * k_, stream, true); + raft::allocate(search_labels_, rows_, stream, true); - raft::copy(input_, input_ptr, rows_ * cols_, handle_.get_stream()); - raft::copy(search_data_, input_ptr, rows_ * cols_, handle_.get_stream()); - raft::copy(search_labels_, labels_ptr, rows_, handle_.get_stream()); + raft::copy(input_, input_ptr, rows_ * cols_, stream); + raft::copy(search_data_, input_ptr, rows_ * cols_, stream); + raft::copy(search_labels_, labels_ptr, rows_, stream); } void TearDown() override { - CUDA_CHECK(cudaFree(search_data_)); - CUDA_CHECK(cudaFree(indices_)); - CUDA_CHECK(cudaFree(distances_)); - CUDA_CHECK(cudaFree(actual_labels_)); + cudaStream_t stream = handle_.get_stream(); + raft::deallocate_all(stream); } private: diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index e5c2d52764..b85d35e3f8 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -57,27 +57,24 @@ TEST(Raft, SpectralMatrices) { ASSERT_EQ(nullptr, sm2.row_offsets_); auto stream = h.get_stream(); - auto t_exe_pol = thrust::cuda::par.on(stream); - auto cnstr_lm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { - laplacian_matrix_t lm1{h, t_exe_pol, ro, ci, - vs, nrows, nnz}; + auto cnstr_lm1 = [&h, ro, ci, vs, nrows, nnz](void) { + laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; }; EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args - auto cnstr_lm2 = [&h, t_exe_pol, &sm2](void) { - laplacian_matrix_t lm2{h, t_exe_pol, sm2}; + auto cnstr_lm2 = [&h, &sm2](void) { + laplacian_matrix_t lm2{h, sm2}; }; EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args - auto cnstr_mm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { - modularity_matrix_t mm1{h, t_exe_pol, ro, ci, - vs, nrows, nnz}; + auto cnstr_mm1 = [&h, ro, ci, vs, nrows, nnz](void) { + modularity_matrix_t mm1{h, ro, ci, vs, nrows, nnz}; }; EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args - auto cnstr_mm2 = [&h, t_exe_pol, &sm2](void) { - modularity_matrix_t mm2{h, t_exe_pol, sm2}; + auto cnstr_mm2 = [&h, &sm2](void) { + modularity_matrix_t mm2{h, sm2}; }; EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args } diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index 4a3b0ed196..a3c88a92be 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -49,11 +49,10 @@ class MeanTest : public ::testing::TestWithParam> { int rows = params.rows, cols = params.cols; int len = rows * cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(data, len); - allocate(mean_act, cols); + raft::allocate(data, len, stream); + raft::allocate(mean_act, cols, stream); r.normal(data, len, params.mean, (T)1.0, stream); meanSGtest(data, stream); @@ -66,13 +65,14 @@ class MeanTest 
: public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(mean_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MeanInputs params; T *data, *mean_act; + cudaStream_t stream; }; // Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu index 8b0d607561..b827230b5d 100644 --- a/cpp/test/stats/mean_center.cu +++ b/cpp/test/stats/mean_center.cu @@ -47,17 +47,16 @@ class MeanCenterTest params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); auto rows = params.rows, cols = params.cols; auto len = rows * cols; IdxType vecLen = params.bcastAlongRows ? cols : rows; - raft::allocate(out, len); - raft::allocate(out_ref, len); - raft::allocate(data, len); - raft::allocate(meanVec, vecLen); + raft::allocate(out, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(data, len, stream); + raft::allocate(meanVec, vecLen, stream); r.normal(data, len, params.mean, (T)1.0, stream); raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor, stream); @@ -65,19 +64,18 @@ class MeanCenterTest params.bcastAlongRows, stream); raft::linalg::naiveMatVec(out_ref, data, meanVec, cols, rows, params.rowMajor, params.bcastAlongRows, (T)-1.0); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(meanVec)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MeanCenterInputs params; T *data, *meanVec, *out, *out_ref; + cudaStream_t stream; }; const std::vector> inputsf_i32 = { diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index ff2698788f..fd374249d2 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -47,15 +47,14 @@ class StdDevTest : public ::testing::TestWithParam> { int rows = params.rows, cols = params.cols; int len = rows * cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(data, len); - allocate(mean_act, cols); - allocate(stddev_act, cols); - allocate(vars_act, cols); + raft::allocate(data, len, stream); + raft::allocate(mean_act, cols, stream); + raft::allocate(stddev_act, cols, stream); + raft::allocate(vars_act, cols, stream); r.normal(data, len, params.mean, params.stddev, stream); stdVarSGtest(data, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void stdVarSGtest(T *data, cudaStream_t stream) { @@ -73,15 +72,14 @@ class StdDevTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(mean_act)); - CUDA_CHECK(cudaFree(stddev_act)); - CUDA_CHECK(cudaFree(vars_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: StdDevInputs params; T *data, *mean_act, *stddev_act, *vars_act; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index c3140d4588..58ebec7859 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -43,9 +43,8 @@ class SumTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); int rows = params.rows, cols = params.cols; int len = rows * cols; - 
cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(data, len); + raft::allocate(data, len, stream); T data_h[len]; for (int i = 0; i < len; i++) { @@ -54,19 +53,20 @@ class SumTest : public ::testing::TestWithParam> { raft::update_device(data, data_h, len, stream); - raft::allocate(sum_act, cols); + raft::allocate(sum_act, cols, stream); sum(sum_act, data, cols, rows, false, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(sum_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: SumInputs params; T *data, *sum_act; + cudaStream_t stream; }; const std::vector> inputsf = {{0.05f, 1024, 32, 1234ULL}, diff --git a/python/raft/common/handle.pxd b/python/raft/common/handle.pxd index 6076640312..884d81bed1 100644 --- a/python/raft/common/handle.pxd +++ b/python/raft/common/handle.pxd @@ -34,7 +34,5 @@ cdef extern from "raft/handle.hpp" namespace "raft" nogil: handle_t() except + handle_t(int ns) except + void set_stream(_Stream s) except + - void set_device_allocator(shared_ptr[allocator] a) except + - shared_ptr[allocator] get_device_allocator() except + _Stream get_stream() except + int get_num_internal_streams() except + From 820e14d621f4c25a1e0dd6b879e01431dee2a300 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 30 Aug 2021 17:08:41 -0500 Subject: [PATCH 012/171] Pin rmm to branch-21.10 and remove warnings from kmeans.hpp (#322) Authors: - Dante Gama Dessavre (https://github.com/dantegd) Approvers: - Seunghwa Kang (https://github.com/seunghwak) - Corey J. Nolet (https://github.com/cjnolet) - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/raft/pull/322 --- cpp/cmake/thirdparty/get_rmm.cmake | 6 +++--- cpp/include/raft/spectral/kmeans.hpp | 16 ++++++---------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index e990ab1367..51f959a8d9 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -32,8 +32,8 @@ function(find_and_configure_rmm VERSION) INSTALL_EXPORT_SET raft-exports CPM_ARGS GIT_REPOSITORY https://github.com/rapidsai/rmm.git - GIT_TAG 23bbe745af1d988224b5498f7b8e3fe3720532d4 - GIT_SHALLOW FALSE + GIT_TAG branch-${MAJOR_AND_MINOR} + GIT_SHALLOW TRUE OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "CUDA_STATIC_RUNTIME ${CUDA_STATIC_RUNTIME}" @@ -44,4 +44,4 @@ endfunction() set(RAFT_MIN_VERSION_rmm "${RAFT_VERSION_MAJOR}.${RAFT_VERSION_MINOR}.00") -find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) \ No newline at end of file +find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index b6f0105487..d089b85518 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -256,7 +256,7 @@ static __global__ void minDistances2(index_type_t n, */ template static __global__ void computeClusterSizes( - index_type_t n, index_type_t k, const index_type_t* __restrict__ codes, + index_type_t n, const index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes) { index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { @@ -341,7 +341,7 @@ static __global__ void divideCentroids( */ template static int chooseNewCentroid(handle_t const& handle, index_type_t n, - index_type_t d, index_type_t k, 
value_type_t rand, + index_type_t d, value_type_t rand, const value_type_t* __restrict__ obs, value_type_t* __restrict__ dists, value_type_t* __restrict__ centroid) { @@ -353,7 +353,6 @@ static int chooseNewCentroid(handle_t const& handle, index_type_t n, index_type_t obsIndex; auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); auto thrust_exec_policy = handle.get_thrust_policy(); // Compute cumulative sum of distances @@ -450,7 +449,6 @@ static int initializeCentroids( thrust::uniform_real_distribution uniformDist(0, 1); auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); auto thrust_exec_policy = handle.get_thrust_policy(); constexpr index_type_t grid_lower_bound{65535}; @@ -478,8 +476,7 @@ static int initializeCentroids( thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); CHECK_CUDA(stream); - if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, - centroids)) + if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid @@ -491,7 +488,7 @@ static int initializeCentroids( // Choose remaining centroids for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, + if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); @@ -509,7 +506,7 @@ static int initializeCentroids( // Compute cluster sizes CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - computeClusterSizes<<>>(n, k, codes, + computeClusterSizes<<>>(n, codes, clusterSizes); CHECK_CUDA(stream); @@ -552,7 +549,6 @@ static int assignCentroids(handle_t const& handle, index_type_t n, index_type_t* __restrict__ clusterSizes, value_type_t* residual_host) { auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); auto thrust_exec_policy = handle.get_thrust_policy(); // Compute distance between centroids and observation vectors @@ -852,7 +848,7 @@ int kmeans(handle_t const& handle, index_type_t n, index_type_t d, // conditions, such as if obs is corrupt (as seen as a result of a // DataFrame column of NULL edge vals used to create the Graph) while (emptyCentroid < k) { - if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, work, + if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) WARNING("could not replace empty centroid"); if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, From e806f999d950abe714921bb4ad78e8a2d1bbfae0 Mon Sep 17 00:00:00 2001 From: Micka <9810050+lowener@users.noreply.github.com> Date: Tue, 7 Sep 2021 17:44:41 +0200 Subject: [PATCH 013/171] Fix build doc (#316) Authors: - Micka (https://github.com/lowener) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/316 --- BUILD.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/BUILD.md b/BUILD.md index a8d22f18d9..844a563a90 100644 --- a/BUILD.md +++ b/BUILD.md @@ -29,14 +29,14 @@ else(DEFINED ENV{RAFT_PATH}) set(RAFT_GIT_DIR ${CMAKE_CURRENT_BINARY_DIR}/raft CACHE STRING "Path to RAFT repo") ExternalProject_Add(raft - GIT_REPOSITORY git@github.com:dantegd/barge.git + GIT_REPOSITORY git@github.com:rapidsai/raft.git GIT_TAG pinned_commit/git_tag/branch PREFIX ${RAFT_GIT_DIR} 
CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "") - set(RAFT_INCLUDE_DIR ${RAFT_DIR}/src/raft/cpp/include CACHE STRING "RAFT include variable") + set(RAFT_INCLUDE_DIR ${RAFT_GIT_DIR}/src/raft/cpp/include CACHE STRING "RAFT include variable") endif(DEFINED ENV{RAFT_PATH}) ``` From f311247690437f669936217b38d62c606244621e Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 9 Sep 2021 14:22:39 -0400 Subject: [PATCH 014/171] Update with rapids cmake new features (#320) This combines some general CMake style cleanup and brings new rapids-cmake features to RAFT including: - Usage of `rapids_cmake_install_lib_dir` to make sure we install raft correctly on non-debain based distro's ( lib64 ), while also handling conda installation requirements ( always lib no matter the distro ) - Usage of `rapids_cpm` pre-configured pacakges - Removal of early termination before `rapids_cpm_find` since a better solution now exists ( https://github.com/rapidsai/rapids-cmake/issues/49 ) Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/320 --- cpp/CMakeLists.txt | 10 +++++----- cpp/cmake/thirdparty/get_cuco.cmake | 4 ---- cpp/cmake/thirdparty/get_gtest.cmake | 27 ++++----------------------- cpp/cmake/thirdparty/get_nccl.cmake | 2 +- cpp/cmake/thirdparty/get_rmm.cmake | 27 ++++----------------------- cpp/cmake/thirdparty/get_thrust.cmake | 15 ++++++--------- cpp/test/CMakeLists.txt | 2 +- 7 files changed, 21 insertions(+), 66 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 04eaf548ce..18dbb25956 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -90,12 +90,12 @@ include(cmake/modules/ConfigureCUDA.cmake) ############################################################################## # - Requirements ------------------------------------------------------------- -if (NOT DISABLE_OPENMP OR NOT ${DISABLE_OPENMP}) +if (NOT DISABLE_OPENMP) find_package(OpenMP) if(OPENMP_FOUND) message(VERBOSE "RAFT: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}") - endif(OPENMP_FOUND) -endif(NOT DISABLE_OPENMP OR NOT ${DISABLE_OPENMP}) + endif() +endif() # add third party dependencies using CPM rapids_cpm_init() @@ -113,7 +113,7 @@ endif() ############################################################################## # - install targets----------------------------------------------------------- - +rapids_cmake_install_lib_dir( lib_dir ) add_library(raft INTERFACE) add_library(raft::raft ALIAS raft) target_include_directories(raft INTERFACE "$" @@ -133,7 +133,7 @@ INTERFACE target_compile_features(raft INTERFACE cxx_std_17 $) install(TARGETS raft - DESTINATION lib + DESTINATION ${lib_dir} EXPORT raft-exports ) diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index 1bfac473d5..06b2d17e7b 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -16,10 +16,6 @@ function(find_and_configure_cuco VERSION) - if(TARGET cuco::cuco) - return() - endif() - rapids_cpm_find(cuco ${VERSION} GLOBAL_TARGETS cuco::cuco BUILD_EXPORT_SET raft-exports diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index 4cd11dab98..7c234283d5 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -14,30 +14,11 @@ # limitations under the License. 
#============================================================================= -function(find_and_configure_gtest VERSION) +function(find_and_configure_gtest ) - if(TARGET GTest::gtest) - return() - endif() - - rapids_cpm_find(GTest ${VERSION} - GLOBAL_TARGETS gest gtest_main GTest::gtest GTest::gtest_main - CPM_ARGS - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG release-${VERSION} - GIT_SHALLOW TRUE - OPTIONS "INSTALL_GTEST OFF" - # googletest >= 1.10.0 provides a cmake config file -- use it if it exists - FIND_PACKAGE_ARGUMENTS "CONFIG" - ) - - if(NOT TARGET GTest::gtest) - add_library(GTest::gtest ALIAS gtest) - add_library(GTest::gtest_main ALIAS gtest_main) - endif() + include(${rapids-cmake-dir}/cpm/gtest.cmake) + rapids_cpm_gtest() endfunction() -set(RAFT_MIN_VERSION_gtest 1.10.0) - -find_and_configure_gtest(${RAFT_MIN_VERSION_gtest}) +find_and_configure_gtest() diff --git a/cpp/cmake/thirdparty/get_nccl.cmake b/cpp/cmake/thirdparty/get_nccl.cmake index a80eefab80..118ae37704 100644 --- a/cpp/cmake/thirdparty/get_nccl.cmake +++ b/cpp/cmake/thirdparty/get_nccl.cmake @@ -16,7 +16,7 @@ function(find_and_configure_nccl) - if(TARGET nccl::nccl) + if(TARGET NCCL::NCCL) return() endif() diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 51f959a8d9..7c155d446f 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -14,34 +14,15 @@ # limitations under the License. #============================================================================= -function(find_and_configure_rmm VERSION) +function(find_and_configure_rmm) - if(TARGET rmm::rmm) - return() - endif() - - if(${VERSION} MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) - set(MAJOR_AND_MINOR "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}") - else() - set(MAJOR_AND_MINOR "${VERSION}") - endif() - - rapids_cpm_find(rmm ${VERSION} + include(${rapids-cmake-dir}/cpm/rmm.cmake) + rapids_cpm_rmm( GLOBAL_TARGETS rmm::rmm BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/rapidsai/rmm.git - GIT_TAG branch-${MAJOR_AND_MINOR} - GIT_SHALLOW TRUE - OPTIONS "BUILD_TESTS OFF" - "BUILD_BENCHMARKS OFF" - "CUDA_STATIC_RUNTIME ${CUDA_STATIC_RUNTIME}" - "DISABLE_DEPRECATION_WARNING ${DISABLE_DEPRECATION_WARNING}" ) endfunction() -set(RAFT_MIN_VERSION_rmm "${RAFT_VERSION_MAJOR}.${RAFT_VERSION_MINOR}.00") - -find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) +find_and_configure_rmm() diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 6dd9a91870..c28ff6e66d 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -13,18 +13,15 @@ # ============================================================================= # Use CPM to find or clone thrust -function(find_and_configure_thrust VERSION) +function(find_and_configure_thrust) + include(${rapids-cmake-dir}/cpm/thrust.cmake) - rapids_cpm_find( - Thrust ${VERSION} + rapids_cpm_thrust( + NAMESPACE raft BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/NVIDIA/thrust.git - GIT_TAG ${VERSION} - GIT_SHALLOW TRUE - OPTIONS "THRUST_INSTALL OFF") + ) endfunction() -find_and_configure_thrust(1.12.0) +find_and_configure_thrust() diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 0428e09142..fb766a5bcd 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -131,7 +131,7 @@ PRIVATE FAISS::FAISS GTest::gtest 
GTest::gtest_main - OpenMP::OpenMP_CXX Threads::Threads + $ $ ) From f3df0ba74906bbc46e2245b19218674311eda02e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Wed, 15 Sep 2021 17:51:32 -0400 Subject: [PATCH 015/171] Add broadcast with const input iterator (#328) The current ```bcast``` function takes a single ```value_t*``` pointer (MPI style) for both input (if root) and output (if non-root). This does not compile if we have ```const value_t*``` pointer for input. This PR adds a ```bcast``` function that takes separate ```const value_t*``` input and ```value_t```` output pointers (NCCL style). Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/328 --- cpp/include/raft/comms/comms.hpp | 21 +++++++++++++++++++++ cpp/include/raft/comms/mpi_comms.hpp | 7 +++++++ cpp/include/raft/comms/std_comms.hpp | 7 +++++++ 3 files changed, 35 insertions(+) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index dc172c9503..3f2f6b28f9 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -114,6 +114,10 @@ class comms_iface { virtual void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; + virtual void bcast(const void* sendbuff, void* recvbuff, size_t count, + datatype_t datatype, int root, + cudaStream_t stream) const = 0; + virtual void reduce(const void* sendbuff, void* recvbuff, size_t count, datatype_t datatype, op_t op, int root, cudaStream_t stream) const = 0; @@ -282,6 +286,23 @@ class comms_t { stream); } + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to broadcast (only used in root) + * @param recvbuff buffer to receive broadcasted data + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast(const value_t* sendbuff, value_t* recvbuff, size_t count, int root, + cudaStream_t stream) const { + impl_->bcast(static_cast(sendbuff), + static_cast(recvbuff), count, get_type(), root, + stream); + } + /** * Reduce data from many ranks down to a single rank * @tparam value_t datatype of underlying buffers diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp index 8dda74f0a9..067c7bd0ab 100644 --- a/cpp/include/raft/comms/mpi_comms.hpp +++ b/cpp/include/raft/comms/mpi_comms.hpp @@ -202,6 +202,13 @@ class mpi_comms : public comms_iface { nccl_comm_, stream)); } + void bcast(const void* sendbuff, void* recvbuff, size_t count, + datatype_t datatype, int root, cudaStream_t stream) const { + NCCL_TRY(ncclBroadcast(sendbuff, recvbuff, count, + get_nccl_datatype(datatype), root, nccl_comm_, + stream)); + } + void reduce(const void* sendbuff, void* recvbuff, size_t count, datatype_t datatype, op_t op, int root, cudaStream_t stream) const { diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index ff75931fb9..562c548bcb 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -307,6 +307,13 @@ class std_comms : public comms_iface { nccl_comm_, stream)); } + void bcast(const void *sendbuff, void *recvbuff, size_t count, + datatype_t datatype, int root, cudaStream_t stream) const { + NCCL_TRY(ncclBroadcast(sendbuff, recvbuff, count, + 
get_nccl_datatype(datatype), root, nccl_comm_, + stream)); + } + void reduce(const void *sendbuff, void *recvbuff, size_t count, datatype_t datatype, op_t op, int root, cudaStream_t stream) const { From 028609577a3e8d200e63ee306ea661da72732f53 Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Thu, 16 Sep 2021 19:39:52 +0200 Subject: [PATCH 016/171] Fix wrong lda parameter in gemv (#327) Fix wrong `lda` parameter in raft::linalg::gemv. lda should always be along `n_rows` direction, independently of `trans_a`. I also took a liberty to add couple more overloads and documentation. Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Corey J. Nolet (https://github.com/cjnolet) - Brad Rees (https://github.com/BradReesWork) - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/raft/pull/327 --- cpp/include/raft/linalg/gemv.h | 129 ++++++++++++++++++++++------- cpp/test/CMakeLists.txt | 1 + cpp/test/linalg/gemv.cu | 143 +++++++++++++++++++++++++++++++++ 3 files changed, 242 insertions(+), 31 deletions(-) create mode 100644 cpp/test/linalg/gemv.cu diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.h index edd18b3bee..0be11a0301 100644 --- a/cpp/include/raft/linalg/gemv.h +++ b/cpp/include/raft/linalg/gemv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,49 +26,116 @@ namespace raft { namespace linalg { template -void gemv(const raft::handle_t& handle, const math_t* a, int n_rows, int n_cols, - const math_t* x, int incx, math_t* y, int incy, bool trans_a, - math_t alpha, math_t beta, cudaStream_t stream) { +void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows, + const int n_cols, const math_t *x, const int incx, math_t *y, + const int incy, const bool trans_a, const math_t alpha, + const math_t beta, cudaStream_t stream) { cublasHandle_t cublas_h = handle.get_cublas_handle(); - cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasgemv(cublas_h, op_a, n_rows, n_cols, &alpha, A, n_rows, x, + incx, &beta, y, incy, stream)); +} - // Unfortunately there is a clash of terminology - // in BLAS https://docs.nvidia.com/cuda/cublas/index.html is opposite to Machine Learning - // In blas: - // m - number of rows in input matrix - // n - number of columns in input matrix - // lda - purpose of it to have ability to operate on submatrices of matrix without copying. - // If you're not think about it it's always should be equal to m - // lda has deal with memory layout, but has nothing with the requirement for cuBLAS perform transpose - - // In Machine Learning: - // m - nunmber of columns in design matrix(number of features) - // n - number of rows in designed matrix (number of train examples) +/** + * y = alpha * op(A) * x + beta * y + * + * where + * + * @param A is a column-major matrix of size n_rows_a * n_cols_a. + * op(A) is either the transpose operation (trans_a == true) or identity. + * + * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. + * set it when you need to use only the first n_rows_a rows of the matrix A, which has + * (perhaps, due to padding) lda rows. + * + * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. 
+ * + * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + */ +template +void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, + const int n_cols_a, const math_t *x, math_t *y, const bool trans_a, + const math_t alpha, const math_t beta, cudaStream_t stream) { + gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); +} - int m = n_rows; - int n = n_cols; - int lda = trans_a ? m : n; +/** + * y = op(A) * x + * + * where + * + * @param A is a column-major matrix of size n_rows_a * n_cols_a. + * op(A) is either the transpose operation (trans_a == true) or identity. + * + * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. + * + * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + */ +template +void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, + const int n_cols_a, const math_t *x, math_t *y, const bool trans_a, + cudaStream_t stream) { + math_t alpha = math_t(1); + math_t beta = math_t(0); - CUBLAS_CHECK(cublasgemv(cublas_h, op_a, m, n, &alpha, a, lda, x, incx, &beta, - y, incy, stream)); + gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); } +/** + * y = alpha * op(A) * x + beta * y + * + * where + * + * @param alpha is a scalar scale of Ax. + * + * @param beta is a scalar scale of y. + * + * @param A is a column-major matrix of size n_rows_a * n_cols_a. + * op(A) is either the transpose operation (trans_a == true) or identity. + * + * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. + * set it when you need to use only the first n_rows_a rows of the matrix A, which has + * (perhaps, due to padding) lda rows. + * + * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. + * + * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + */ template -void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a, - int n_cols_a, const math_t* x, math_t* y, bool trans_a, math_t alpha, - math_t beta, cudaStream_t stream) { - gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); +void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, + const int n_cols_a, const int lda, const math_t *x, math_t *y, + const bool trans_a, const math_t alpha, const math_t beta, + cudaStream_t stream) { + cublasHandle_t cublas_h = handle.get_cublas_handle(); + cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, + 1, &beta, y, 1, stream)); } +/** + * y = op(A) * x + * + * where + * + * @param A is a column-major matrix of size n_rows_a * n_cols_a. + * op(A) is either the transpose operation (trans_a == true) or identity. + * + * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. + * set it when you need to use only the first n_rows_a rows of the matrix A, which has + * (perhaps, due to padding) lda rows. + * + * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. + * + * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. 
+ * + */ template -void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a, - int n_cols_a, const math_t* x, math_t* y, bool trans_a, - cudaStream_t stream) { +void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, + const int n_cols_a, const int lda, const math_t *x, math_t *y, + const bool trans_a, cudaStream_t stream) { math_t alpha = math_t(1); math_t beta = math_t(0); - - gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); + gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); } }; // namespace linalg diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index fb766a5bcd..42066061f3 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -47,6 +47,7 @@ add_executable(test_raft test/linalg/eig.cu test/linalg/eig_sel.cu test/linalg/gemm_layout.cu + test/linalg/gemv.cu test/linalg/map.cu test/linalg/map_then_reduce.cu test/linalg/matrix_vector_op.cu diff --git a/cpp/test/linalg/gemv.cu b/cpp/test/linalg/gemv.cu new file mode 100644 index 0000000000..4a474bc461 --- /dev/null +++ b/cpp/test/linalg/gemv.cu @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +struct GemvInputs { + int n_rows; + int n_cols; + int lda; + bool trans_a; + unsigned long long int seed; +}; + +// Reference GEMV implementation. +template +__global__ void naiveGemv(T *y, const T *A, const T *x, const int n_rows, + const int n_cols, const int lda, const bool trans_a) { + int istart = blockIdx.x * blockDim.x + threadIdx.x; + int istep = blockDim.x * gridDim.x; + + if (!trans_a) { + for (int i = istart; i < n_rows; i += istep) { + T t = T(0.0); + for (int j = 0; j < n_cols; j++) { + t += A[i + lda * j] * x[j]; + } + y[i] = t; + } + } else { + for (int i = istart; i < n_cols; i += istep) { + T t = T(0.0); + for (int j = 0; j < n_rows; j++) { + t += A[lda * i + j] * x[j]; + } + y[i] = t; + } + } +} + +template +class GemvTest : public ::testing::TestWithParam> { + protected: + GemvInputs params; + rmm::device_uvector refy; // Reference result for comparison + rmm::device_uvector y; // Computed result + + public: + GemvTest() + : testing::TestWithParam>(), + refy(0, rmm::cuda_stream_default), + y(0, rmm::cuda_stream_default) { + rmm::cuda_stream_default.synchronize(); + } + + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + + raft::handle_t handle; + cudaStream_t stream = handle.get_stream(); + + raft::random::Rng r(params.seed); + + // We compute y = op(A) * x and compare against reference result + size_t aElems = params.lda * params.n_cols; + size_t xElems = params.trans_a ? params.n_rows : params.n_cols; + size_t yElems = params.trans_a ? 
params.n_cols : params.n_rows; + + rmm::device_uvector A(aElems, stream); + rmm::device_uvector x(xElems, stream); + refy.resize(yElems, stream); + y.resize(yElems, stream); + + r.uniform(x.data(), xElems, T(-10.0), T(10.0), stream); + r.uniform(A.data(), aElems, T(-10.0), T(10.0), stream); + + dim3 blocks(raft::ceildiv(yElems, 256), 1, 1); + dim3 threads(256, 1, 1); + + naiveGemv<<>>(refy.data(), A.data(), x.data(), + params.n_rows, params.n_cols, params.lda, + params.trans_a); + + gemv(handle, A.data(), params.n_rows, params.n_cols, params.lda, x.data(), + y.data(), params.trans_a, stream); + } + + void TearDown() override {} +}; + +const std::vector> inputsf = { + {80, 70, 80, true, 76433ULL}, {80, 100, 80, true, 426646ULL}, + {20, 100, 20, true, 37703ULL}, {100, 60, 200, true, 538004ULL}, + {50, 10, 60, false, 73012ULL}, {90, 90, 90, false, 538147ULL}, + {30, 100, 30, false, 412352ULL}, {40, 80, 100, false, 297941ULL}}; + +const std::vector> inputsd = { + {10, 70, 10, true, 535648ULL}, {30, 30, 30, true, 956681ULL}, + {70, 80, 70, true, 875083ULL}, {80, 90, 200, true, 50744ULL}, + {90, 90, 90, false, 506321ULL}, {40, 100, 70, false, 638418ULL}, + {80, 50, 80, false, 701529ULL}, {50, 80, 60, false, 893038ULL}}; + +typedef GemvTest GemvTestF; +TEST_P(GemvTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(refy.data(), y.data(), + params.trans_a ? params.n_cols : params.n_rows, + raft::CompareApprox(1e-4))); +} + +typedef GemvTest GemvTestD; +TEST_P(GemvTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(refy.data(), y.data(), + params.trans_a ? params.n_cols : params.n_rows, + raft::CompareApprox(1e-6))); +} + +INSTANTIATE_TEST_SUITE_P(GemvTests, GemvTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_SUITE_P(GemvTests, GemvTestD, ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft From 9ce2fd0d863be9f6dfa3cede7133acc2add705d6 Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Thu, 16 Sep 2021 16:48:16 -0400 Subject: [PATCH 017/171] DOC v21.12 Updates --- CHANGELOG.md | 4 ++++ cpp/CMakeLists.txt | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b8d89d1a8f..cf36c3facd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# raft 21.12.00 (Date TBD) + +Please see https://github.com/rapidsai/raft/releases/tag/v21.12.00a for the latest changes to this development branch. + # raft 21.10.00 (Date TBD) Please see https://github.com/rapidsai/raft/releases/tag/v21.10.00a for the latest changes to this development branch. 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 18dbb25956..4f24b5e8db 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -26,7 +26,7 @@ include(rapids-find) rapids_cuda_init_architectures(RAFT) -project(RAFT VERSION 21.10.00 LANGUAGES CXX CUDA) +project(RAFT VERSION 21.12.00 LANGUAGES CXX CUDA) ############################################################################## # - build type --------------------------------------------------------------- From 65a675f7d66851777f64b79a12af740561b1dc4f Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Fri, 17 Sep 2021 12:33:58 -0700 Subject: [PATCH 018/171] Accounting for rmm::cuda_stream_pool not having a constructor for 0 streams (#329) Comes from this PR https://github.com/rapidsai/rmm/pull/873 Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/329 --- cpp/include/raft/handle.hpp | 28 +++++++++++++++++++++------- cpp/test/cluster_solvers.cu | 1 - cpp/test/eigen_solvers.cu | 2 -- cpp/test/handle.cpp | 14 -------------- cpp/test/spectral_matrix.cu | 1 - 5 files changed, 21 insertions(+), 25 deletions(-) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index c925669530..190062e92f 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -61,8 +61,10 @@ class handle_t { int cur_dev = -1; CUDA_CHECK(cudaGetDevice(&cur_dev)); return cur_dev; - }()), - streams_(n_streams) { + }()) { + if (n_streams != 0) { + streams_ = std::make_unique(n_streams); + } create_resources(); thrust_policy_ = std::make_unique(user_stream_); } @@ -78,10 +80,13 @@ class handle_t { */ handle_t(const handle_t& other, int stream_id, int n_streams = kNumDefaultWorkerStreams) - : dev_id_(other.get_device()), streams_(n_streams) { + : dev_id_(other.get_device()) { RAFT_EXPECTS( other.get_num_internal_streams() > 0, "ERROR: the main handle must have at least one worker stream\n"); + if (n_streams != 0) { + streams_ = std::make_unique(n_streams); + } prop_ = other.get_device_properties(); device_prop_initialized_ = true; create_resources(); @@ -140,14 +145,23 @@ class handle_t { // legacy compatibility for cuML cudaStream_t get_internal_stream(int sid) const { - return streams_.get_stream(sid).value(); + RAFT_EXPECTS( + streams_.get() != nullptr, + "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value"); + return streams_->get_stream(sid).value(); } // new accessor return rmm::cuda_stream_view rmm::cuda_stream_view get_internal_stream_view(int sid) const { - return streams_.get_stream(sid); + RAFT_EXPECTS( + streams_.get() != nullptr, + "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value"); + return streams_->get_stream(sid); + } + + int get_num_internal_streams() const { + return streams_.get() != nullptr ? 
streams_->get_pool_size() : 0; } - int get_num_internal_streams() const { return streams_.get_pool_size(); } std::vector get_internal_streams() const { std::vector int_streams_vec; for (int i = 0; i < get_num_internal_streams(); i++) { @@ -212,7 +226,7 @@ class handle_t { std::unordered_map> subcomms_; const int dev_id_; - rmm::cuda_stream_pool streams_{0}; + std::unique_ptr streams_{nullptr}; mutable cublasHandle_t cublas_handle_; mutable bool cublas_initialized_{false}; mutable cusolverDnHandle_t cusolver_dn_handle_; diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index d280b3e95c..06b246d9a1 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -58,7 +58,6 @@ TEST(Raft, ModularitySolvers) { using value_type = double; handle_t h; - ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); index_type neigvs{10}; diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index 15794ef568..ede790b38c 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -31,7 +31,6 @@ TEST(Raft, EigenSolvers) { using value_type = double; handle_t h; - ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); index_type* ro{nullptr}; @@ -73,7 +72,6 @@ TEST(Raft, SpectralSolvers) { using value_type = double; handle_t h; - ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); index_type neigvs{10}; diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp index 4cb9809844..3e27789078 100644 --- a/cpp/test/handle.cpp +++ b/cpp/test/handle.cpp @@ -24,7 +24,6 @@ namespace raft { TEST(Raft, HandleDefault) { handle_t h; - ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); ASSERT_EQ(nullptr, h.get_stream()); ASSERT_NE(nullptr, h.get_cublas_handle()); @@ -55,7 +54,6 @@ TEST(Raft, GetHandleFromPool) { handle_t child(parent, 2); ASSERT_EQ(parent.get_internal_stream(2), child.get_stream()); - ASSERT_EQ(0, child.get_num_internal_streams()); child.set_stream(parent.get_internal_stream(3)); ASSERT_EQ(parent.get_internal_stream(3), child.get_stream()); @@ -64,18 +62,6 @@ TEST(Raft, GetHandleFromPool) { ASSERT_EQ(parent.get_device(), child.get_device()); } -TEST(Raft, GetHandleFromPoolPerf) { - handle_t parent(100); - auto start = curTimeMillis(); - for (int i = 0; i < parent.get_num_internal_streams(); i++) { - handle_t child(parent, i); - ASSERT_EQ(parent.get_internal_stream(i), child.get_stream()); - child.wait_on_user_stream(); - } - // upperbound on 0.1ms per child handle - ASSERT_LE(curTimeMillis() - start, 10); -} - TEST(Raft, GetHandleStreamViews) { handle_t parent(4); diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index b85d35e3f8..388ad56f2d 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -38,7 +38,6 @@ TEST(Raft, SpectralMatrices) { using value_type = double; handle_t h; - ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); csr_view_t csr_v{nullptr, nullptr, nullptr, 0, 0}; From af3deeafd25b4263c14b93a4d3cb49437d899d35 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade <36705640+mdoijade@users.noreply.github.com> Date: Mon, 20 Sep 2021 22:39:51 +0530 Subject: [PATCH 019/171] Fused L2 (unexpanded) kNN kernel for NN <= 64, without using temporary gmem to store intermediate distances (#324) benchmarking with cuml python interface kNN datasets (it claims to generate gaussian distribution) tried till 200k x 128 database/query vectors. 
found some different behavior on my small GPU GP107 vs on GA102(Tesla A40) on GP107 fused L2 kNN is slower on larger datasets on GA102 fused L2 kNN is always faster like approx **1.15x-1.5x** for all datasets I tried (except 200k x 128). I will also have L2 expanded version of fused L2 kNN in a separate PR due to which on larger dimension like > 128 distance computation from fused L2 kNN won't become bottleneck. There is scope to optimize the distance computation in fused L2 kNN as there is no usage of vectorized LDG/STS in it. Overall it looks that fused L2 kNN is better on GPUs with decent compute power but not on small old GPUs like GP107. & benchmarking with cuml cpp kNN regression tests the performance is On A30 (GA100) , For NN == 64, resultant Dist matrix 1M x 1M, Fused L2 kNN = 11550ms FAISS kNN = 23933 ms. **Overall 2.07x faster** And for NN == 32, it is **1.43x faster** runtimes for NN == 32, Fused L2 kNN = 11198ms FAISS kNN = 16124ms Authors: - Mahesh Doijade (https://github.com/mdoijade) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/324 --- cpp/include/raft/linalg/contractions.cuh | 21 + .../raft/spatial/knn/detail/fused_l2_knn.cuh | 612 ++++++++++++++++++ .../knn/detail/knn_brute_force_faiss.cuh | 114 ++-- 3 files changed, 709 insertions(+), 38 deletions(-) create mode 100644 cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index aa711a9140..e6ff8a49ce 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ b/cpp/include/raft/linalg/contractions.cuh @@ -166,6 +166,27 @@ struct Policy4x4 { }; /** @} */ +/** + * @defgroup Policy2x8 16 elements per thread Policy with k-block = 16 + * @{ + */ +template +struct Policy2x8 {}; + +template +struct Policy2x8 { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; + +template +struct Policy2x8 { + // this is not used just for keeping compiler happy. + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; +/** @} */ + /** * @brief Base class for gemm-like NT contractions * diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh new file mode 100644 index 0000000000..9d00d9b9f4 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh @@ -0,0 +1,612 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#include +#include +#include +#include "processing.hpp" + +namespace raft { +namespace spatial { +namespace knn { +namespace detail { + +template +DI void loadAllWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const IdxT m, + const unsigned int numOfNN) { + const int lid = raft::laneId(); +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = + (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + if (rowId < m) { +#pragma unroll + for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { + const int idx = j * warpSize + lid; + if (idx < numOfNN) { + Pair KVPair = shDumpKV[rowId * numOfNN + idx]; + heapArr[i]->warpV[j] = KVPair.key; + heapArr[i]->warpK[j] = KVPair.value; + } + } + } + } +} + +template +DI void loadWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const int rowId, + const unsigned int numOfNN) { + const int lid = raft::laneId(); +#pragma unroll + for (int j = 0; j < heapArr->kNumWarpQRegisters; ++j) { + const int idx = j * warpSize + lid; + if (idx < numOfNN) { + Pair KVPair = shDumpKV[rowId * numOfNN + idx]; + heapArr->warpV[j] = KVPair.key; + heapArr->warpK[j] = KVPair.value; + } + } +} + +template +DI void storeWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const IdxT rowId, + const unsigned int numOfNN) { + const int lid = raft::laneId(); + +#pragma unroll + for (int j = 0; j < heapArr->kNumWarpQRegisters; ++j) { + const int idx = j * warpSize + lid; + if (idx < numOfNN) { + Pair otherKV = Pair(heapArr->warpV[j], heapArr->warpK[j]); + shDumpKV[rowId * numOfNN + idx] = otherKV; + } + } +} + +template +DI void storeWarpQGmem(myWarpSelect &heapArr, OutT *out_dists, IdxT *out_inds, + const IdxT m, const unsigned int numOfNN, + const IdxT starty) { + const int lid = raft::laneId(); +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto gmemRowId = starty + i * Policy::AccThRows; + if (gmemRowId < m) { +#pragma unroll + for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { + const auto idx = j * warpSize + lid; + if (idx < numOfNN) { + out_dists[gmemRowId * numOfNN + idx] = heapArr[i]->warpK[j]; + out_inds[gmemRowId * numOfNN + idx] = (IdxT)heapArr[i]->warpV[j]; + } + } + } + } +} + +template +DI void loadPrevTopKsGmemWarpQ(myWarpSelect &heapArr, OutT *out_dists, + IdxT *out_inds, const IdxT m, + const unsigned int numOfNN, const IdxT starty) { + const int lid = raft::laneId(); +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto gmemRowId = starty + i * Policy::AccThRows; + if (gmemRowId < m) { +#pragma unroll + for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { + const auto idx = j * warpSize + lid; + if (idx < numOfNN) { + heapArr[i]->warpK[j] = out_dists[gmemRowId * numOfNN + idx]; + heapArr[i]->warpV[j] = (uint32_t)out_inds[gmemRowId * numOfNN + idx]; + } + } + auto constexpr kLaneWarpKTop = heapArr[i]->kNumWarpQRegisters - 1; + heapArr[i]->warpKTop = + raft::shfl(heapArr[i]->warpK[kLaneWarpKTop], heapArr[i]->kLane); + } + } +} + +template +DI void updateSortedWarpQ(myWarpSelect &heapArr, Pair *allWarpTopKs, int rowId, + int finalNumVals, int startId = 0) { + constexpr uint32_t mask = 0xffffffffu; + const int lid = raft::laneId(); + // calculate srcLane such that tid 0 -> 31, 1 -> 0,... 31 -> 30. 
+ // warp around 0 to 31 required for NN > 32 + const auto srcLane = (warpSize + (lid - 1)) & (warpSize - 1); + + for (int k = startId; k < finalNumVals; k++) { + Pair KVPair = allWarpTopKs[rowId * (256) + k]; +#pragma unroll + for (int i = 0; i < NumWarpQRegs; i++) { + unsigned activeLanes = + __ballot_sync(mask, KVPair.value < heapArr->warpK[i]); + if (activeLanes) { + Pair tempKV; + tempKV.value = raft::shfl(heapArr->warpK[i], srcLane); + tempKV.key = raft::shfl(heapArr->warpV[i], srcLane); + const auto firstActiveLane = __ffs(activeLanes); + if (firstActiveLane == (lid + 1)) { + heapArr->warpK[i] = KVPair.value; + heapArr->warpV[i] = KVPair.key; + } else if (activeLanes & ((uint32_t)1 << lid)) { + heapArr->warpK[i] = tempKV.value; + heapArr->warpV[i] = tempKV.key; + } + if (i == 0 && NumWarpQRegs > 1) { + if (lid == 0) { + heapArr->warpK[1] = tempKV.value; + heapArr->warpV[1] = tempKV.key; + } + heapArr->warpK[1] = __shfl_up_sync(mask, heapArr->warpK[1], 1); + heapArr->warpV[1] = __shfl_up_sync(mask, heapArr->warpV[1], 1); + break; + } + } + } + } +} + +template +__global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( + const DataT *x, const DataT *y, const DataT *_xn, const DataT *_yn, + const IdxT m, const IdxT n, const IdxT k, const IdxT lda, const IdxT ldb, + const IdxT ldd, CoreLambda core_op, FinalLambda fin_op, bool sqrt, + unsigned int numOfNN, int *mutexes, OutT *out_dists, IdxT *out_inds) { + extern __shared__ char smem[]; + + typedef cub::KeyValuePair Pair; + constexpr auto identity = std::numeric_limits::max(); + constexpr auto keyMax = std::numeric_limits::max(); + constexpr auto Dir = false; + typedef faiss::gpu::WarpSelect< + AccT, uint32_t, Dir, faiss::gpu::Comparator, NumWarpQ, NumThreadQ, 32> + myWarpSelect; + + auto rowEpilog_lambda = [m, n, numOfNN, out_dists, out_inds, + mutexes] __device__(IdxT gridStrideY) { + if (gridDim.x == 1) { + return; + } + + volatile int *mutex = mutexes; + Pair *shDumpKV = (Pair *)(&smem[Policy::SmemSize]); + const int lid = threadIdx.x % warpSize; + const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols); + + // 0 -> consumer done consuming the buffer. + // -1 -> consumer started consuming the buffer + // -2 -> producer done filling the buffer + // blockIdx.x -> prod started to fill the buffer + if (blockIdx.x == 0) { + auto cta_processed = 0; + myWarpSelect heapArr1(identity, keyMax, numOfNN); + myWarpSelect heapArr2(identity, keyMax, numOfNN); + myWarpSelect *heapArr[] = {&heapArr1, &heapArr2}; + __syncthreads(); + + loadAllWarpQShmem(heapArr, &shDumpKV[0], m, numOfNN); + + while (cta_processed < gridDim.x - 1) { + Pair otherKV[Policy::AccRowsPerTh]; + + if (threadIdx.x == 0) { + int32_t old = -3; + while (old != -1) { + old = atomicCAS((int *)&mutex[gridStrideY / Policy::Mblk], -2, -1); + } + __threadfence(); + } + __syncthreads(); + +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + otherKV[i].value = identity; + otherKV[i].key = keyMax; + + if (lid < numOfNN && rowId < m) { + otherKV[i].value = out_dists[rowId * numOfNN + lid]; + otherKV[i].key = (uint32_t)out_inds[rowId * numOfNN + lid]; + } + } + __threadfence(); + + if (threadIdx.x == 0) { + mutex[gridStrideY / Policy::Mblk] = 0; + __threadfence(); + } + + // Perform merging of otherKV with topk's across warp. 
+#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + if (rowId < m) { + heapArr[i]->add(otherKV[i].value, otherKV[i].key); + } + } + + cta_processed++; + } +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + if (rowId < m) { + bool needSort = (heapArr[i]->numVals > 0); + needSort = __any_sync(0xffffffff, needSort); + if (needSort) { + heapArr[i]->reduce(); + } + } + } + storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, + starty); + } else { + if (threadIdx.x == 0) { + int32_t old = -1; + int32_t blkIdX = (int32_t)blockIdx.x; + while (old != blkIdX) { + old = atomicCAS((int *)&mutex[gridStrideY / Policy::Mblk], 0, blkIdX); + } + __threadfence(); + } + __syncthreads(); + +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + const auto shMemRowId = + (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + if (rowId < m) { + for (int idx = lid; idx < numOfNN; idx += warpSize) { + Pair KVPair = shDumpKV[shMemRowId * numOfNN + idx]; + out_dists[rowId * numOfNN + idx] = KVPair.value; + out_inds[rowId * numOfNN + idx] = (IdxT)KVPair.key; + } + } + } + __threadfence(); + + if (threadIdx.x == 0) { + mutex[gridStrideY / Policy::Mblk] = -2; + __threadfence(); + } + } + }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = + [numOfNN, sqrt, m, n, ldd, out_dists, out_inds] __device__( + AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT * regxn, + DataT * regyn, IdxT gridStrideX, IdxT gridStrideY) { + if (sqrt) { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = raft::mySqrt(acc[i][j]); + } + } + } + Pair *shDumpKV = (Pair *)(&smem[Policy::SmemSize]); + + constexpr uint32_t mask = 0xffffffffu; + const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols); + const IdxT startx = gridStrideX + (threadIdx.x % Policy::AccThCols); + const int lid = raft::laneId(); + + myWarpSelect heapArr1(identity, keyMax, numOfNN); + myWarpSelect heapArr2(identity, keyMax, numOfNN); + myWarpSelect *heapArr[] = {&heapArr1, &heapArr2}; + if (usePrevTopKs) { + if (gridStrideX == blockIdx.x * Policy::Nblk) { + loadPrevTopKsGmemWarpQ(heapArr, out_dists, out_inds, m, + numOfNN, starty); + } + } + + if (gridStrideX > blockIdx.x * Policy::Nblk) { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = + (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + Pair tempKV = shDumpKV[(rowId * numOfNN) + numOfNN - 1]; + heapArr[i]->warpKTop = tempKV.value; + } + + // total vals can atmost be 256, (32*8) + int numValsWarpTopK[Policy::AccRowsPerTh]; + int anyWarpTopKs = 0; +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + numValsWarpTopK[i] = 0; + if (rowId < m) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + const auto colId = startx + j * Policy::AccThCols; + if (colId < ldd) { + if (acc[i][j] < heapArr[i]->warpKTop) { + numValsWarpTopK[i]++; + } + } + } + anyWarpTopKs += numValsWarpTopK[i]; + } + } + anyWarpTopKs = __syncthreads_or(anyWarpTopKs > 0); + if (anyWarpTopKs) { + Pair *allWarpTopKs = (Pair *)(&smem[0]); + uint32_t needScanSort[Policy::AccRowsPerTh]; + +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto 
gmemRowId = starty + i * Policy::AccThRows; + needScanSort[i] = 0; + if (gmemRowId < m) { + int myVals = numValsWarpTopK[i]; + needScanSort[i] = __ballot_sync(mask, myVals > 0); + if (needScanSort[i]) { +#pragma unroll + for (unsigned int k = 1; k <= 16; k *= 2) { + const unsigned int n = + __shfl_up_sync(mask, numValsWarpTopK[i], k); + if (lid >= k) { + numValsWarpTopK[i] += n; + } + } + } + // As each thread will know its total vals to write. + // we only store its starting location. + numValsWarpTopK[i] -= myVals; + } + + if (needScanSort[i]) { + const auto rowId = + (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + if (gmemRowId < m) { + if (needScanSort[i] & ((uint32_t)1 << lid)) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + const auto colId = startx + j * Policy::AccThCols; + if (colId < ldd) { + if (acc[i][j] < heapArr[i]->warpKTop) { + Pair otherKV = {colId, acc[i][j]}; + allWarpTopKs[rowId * (256) + numValsWarpTopK[i]] = + otherKV; + numValsWarpTopK[i]++; + } + } + } + } + const int finalNumVals = raft::shfl(numValsWarpTopK[i], 31); + loadWarpQShmem(heapArr[i], &shDumpKV[0], rowId, + numOfNN); + updateSortedWarpQkNumWarpQRegisters>( + heapArr[i], &allWarpTopKs[0], rowId, finalNumVals); + } + } + } + __syncthreads(); +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + if (needScanSort[i]) { + const auto rowId = + (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto gmemRowId = starty + i * Policy::AccThRows; + if (gmemRowId < m) { + storeWarpQShmem(heapArr[i], shDumpKV, rowId, + numOfNN); + } + } + } + } + } else { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto gmemRowId = starty + i * Policy::AccThRows; + const auto shMemRowId = + (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + if (gmemRowId < m) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + const auto colId = startx + j * Policy::AccThCols; + Pair otherKV = {keyMax, identity}; + if (colId < ldd) { + otherKV.value = acc[i][j]; + otherKV.key = colId; + } + heapArr[i]->add(otherKV.value, otherKV.key); + } + + bool needSort = (heapArr[i]->numVals > 0); + needSort = __any_sync(mask, needSort); + if (needSort) { + heapArr[i]->reduce(); + } + storeWarpQShmem(heapArr[i], shDumpKV, shMemRowId, + numOfNN); + } + } + } + + if (((gridStrideX + Policy::Nblk * gridDim.x) > n) && gridDim.x == 1) { + // This is last iteration of grid stride X + loadAllWarpQShmem(heapArr, &shDumpKV[0], m, numOfNN); + storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, + starty); + } + }; + + raft::distance::PairwiseDistances + obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, nullptr, smem, core_op, + epilog_lambda, fin_op, rowEpilog_lambda); + obj.run(); +} + +template +void fusedL2kNNImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, + IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, OutT *out_dists, + IdxT *out_inds, IdxT numOfNN, cudaStream_t stream, + void *workspace, size_t &worksize) { + typedef typename raft::linalg::Policy2x8::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef typename std::conditional::type KPolicy; + + ASSERT(isRowMajor, "Only Row major inputs are allowed"); + + dim3 blk(KPolicy::Nthreads); + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + const auto diff = x - y; + acc += diff * diff; + }; + + auto fin_op = [] __device__(AccT d_val, int g_d_idx) { return d_val; }; + + typedef cub::KeyValuePair 
Pair; + + if (isRowMajor) { + constexpr auto fusedL2kNN32RowMajor = + fusedL2kNN; + constexpr auto fusedL2kNN64RowMajor = + fusedL2kNN; + + auto fusedL2kNNRowMajor = fusedL2kNN32RowMajor; + if (numOfNN <= 32) { + fusedL2kNNRowMajor = fusedL2kNN32RowMajor; + } else if (numOfNN <= 64) { + fusedL2kNNRowMajor = fusedL2kNN64RowMajor; + } else { + ASSERT(numOfNN <= 64, + "fusedL2kNN: num of nearest neighbors must be <= 64"); + } + + dim3 grid = raft::distance::launchConfigGenerator( + m, n, KPolicy::SmemSize, fusedL2kNNRowMajor); + if (grid.x > 1) { + const auto numMutexes = raft::ceildiv(m, KPolicy::Mblk); + if (workspace == nullptr || worksize < (sizeof(int32_t) * numMutexes)) { + worksize = sizeof(int32_t) * numMutexes; + return; + } else { + CUDA_CHECK( + cudaMemsetAsync(workspace, 0, sizeof(int32_t) * numMutexes, stream)); + } + } + + const auto sharedMemSize = + KPolicy::SmemSize + (KPolicy::Mblk * numOfNN * sizeof(Pair)); + + fusedL2kNNRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, core_lambda, fin_op, sqrt, + (uint32_t)numOfNN, (int *)workspace, out_dists, out_inds); + } else { + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void fusedL2kNN(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, bool sqrt, OutT *out_dists, + IdxT *out_inds, IdxT numOfNN, cudaStream_t stream, + void *workspace, size_t &worksize) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + fusedL2kNNImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, + out_inds, numOfNN, stream, workspace, worksize); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + fusedL2kNNImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, + out_inds, numOfNN, stream, workspace, worksize); + } else { + fusedL2kNNImpl( + x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, out_inds, numOfNN, stream, + workspace, worksize); + } +} + +/** + * Compute the k-nearest neighbors using L2 unexpanded distance. + + * @tparam value_idx + * @tparam value_t + * @param[out] out_inds output indices array on device (size n_query_rows * k) + * @param[out] out_dists output dists array on device (size n_query_rows * k) + * @param[in] index input index array on device (size n_index_rows * D) + * @param[in] query input query array on device (size n_query_rows * D) + * @param[in] n_index_rows number of rows in index array + * @param[in] n_query_rows number of rows in query array + * @param[in] k number of closest neighbors to return + * @param[in] rowMajorIndex are the index arrays in row-major layout? + * @param[in] rowMajorQuery are the query array in row-major layout? 
+ * @param[in] stream stream to order kernel launch + */ +template +void l2_unexpanded_knn(size_t D, value_idx *out_inds, value_t *out_dists, + const value_t *index, const value_t *query, + size_t n_index_rows, size_t n_query_rows, int k, + bool rowMajorIndex, bool rowMajorQuery, + cudaStream_t stream, void *workspace, size_t &worksize) { + // Validate the input data + ASSERT(k > 0, "l2Knn: k must be > 0"); + ASSERT(D > 0, "l2Knn: D must be > 0"); + ASSERT(n_index_rows > 0, "l2Knn: n_index_rows must be > 0"); + ASSERT(index, "l2Knn: index must be provided (passed null)"); + ASSERT(n_query_rows > 0, "l2Knn: n_query_rows must be > 0"); + ASSERT(query, "l2Knn: query must be provided (passed null)"); + ASSERT(out_dists, "l2Knn: out_dists must be provided (passed null)"); + ASSERT(out_inds, "l2Knn: out_inds must be provided (passed null)"); + // Currently we only support same layout for x & y inputs. + ASSERT(rowMajorIndex == rowMajorQuery, + "l2Knn: rowMajorIndex and rowMajorQuery should have same layout"); + + bool sqrt = (distanceType == raft::distance::DistanceType::L2SqrtUnexpanded); + + if (rowMajorIndex) { + value_idx lda = D, ldb = D, ldd = n_index_rows; + fusedL2kNN( + n_query_rows, n_index_rows, D, lda, ldb, ldd, query, index, sqrt, + out_dists, out_inds, k, stream, workspace, worksize); + } else { + // TODO: Add support for column major layout + } +} + +} // namespace detail +} // namespace knn +} // namespace spatial +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 84c130b0e4..94ace19580 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -34,6 +34,7 @@ #include #include +#include "fused_l2_knn.cuh" #include "haversine_distance.cuh" #include "processing.hpp" @@ -269,44 +270,81 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, cudaStream_t stream = raft::select_stream(userStream, internalStreams, n_int_streams, i); - switch (metric) { - case raft::distance::DistanceType::Haversine: - - ASSERT(D == 2, - "Haversine distance requires 2 dimensions " - "(latitude / longitude)."); - - haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, - k, stream); - break; - default: - faiss::MetricType m = build_faiss_metric(metric); - - faiss::gpu::StandardGpuResources gpu_res; - - gpu_res.noTempMemory(); - gpu_res.setDefaultStream(device, stream); - - faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.metricArg = metricArg; - args.k = k; - args.dims = D; - args.vectors = input[i]; - args.vectorsRowMajor = rowMajorIndex; - args.numVectors = sizes[i]; - args.queries = search_items; - args.queriesRowMajor = rowMajorQuery; - args.numQueries = n; - args.outDistances = out_d_ptr; - args.outIndices = out_i_ptr; - - /** - * @todo: Until FAISS supports pluggable allocation strategies, - * we will not reap the benefits of the pool allocator for - * avoiding device-wide synchronizations from cudaMalloc/cudaFree - */ - bfKnn(&gpu_res, args); + if (k <= 64 && rowMajorQuery == rowMajorIndex && rowMajorQuery == true && + (metric == raft::distance::DistanceType::L2Unexpanded || + metric == raft::distance::DistanceType::L2SqrtUnexpanded || + metric == raft::distance::DistanceType::L2Expanded || + metric == raft::distance::DistanceType::L2SqrtExpanded)) { + size_t worksize = 0; + void *workspace = nullptr; + + switch (metric) { + case 
raft::distance::DistanceType::L2Expanded: + case raft::distance::DistanceType::L2Unexpanded: + case raft::distance::DistanceType::L2SqrtExpanded: + // Even for L2 Sqrt distance case we use non-sqrt version + // as FAISS bfKNN only support non-sqrt metric & some tests + // in RAFT/cuML (like Linkage) fails if we use L2 sqrt. + // Even for L2 Sqrt distance case we use non-sqrt version + // as FAISS bfKNN only support non-sqrt metric & some tests + // in RAFT/cuML (like Linkage) fails if we use L2 sqrt. + case raft::distance::DistanceType::L2SqrtUnexpanded: + l2_unexpanded_knn( + D, out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, + rowMajorIndex, rowMajorQuery, stream, workspace, worksize); + if (worksize) { + rmm::device_uvector d_mutexes(worksize, stream); + workspace = d_mutexes.data(); + l2_unexpanded_knn( + D, out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, + rowMajorIndex, rowMajorQuery, stream, workspace, worksize); + } + break; + default: + break; + } + } else { + switch (metric) { + case raft::distance::DistanceType::Haversine: + + ASSERT(D == 2, + "Haversine distance requires 2 dimensions " + "(latitude / longitude)."); + + haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], + n, k, stream); + break; + default: + faiss::MetricType m = build_faiss_metric(metric); + + faiss::gpu::StandardGpuResources gpu_res; + + gpu_res.noTempMemory(); + gpu_res.setDefaultStream(device, stream); + + faiss::gpu::GpuDistanceParams args; + args.metric = m; + args.metricArg = metricArg; + args.k = k; + args.dims = D; + args.vectors = input[i]; + args.vectorsRowMajor = rowMajorIndex; + args.numVectors = sizes[i]; + args.queries = search_items; + args.queriesRowMajor = rowMajorQuery; + args.numQueries = n; + args.outDistances = out_d_ptr; + args.outIndices = out_i_ptr; + + /** + * @todo: Until FAISS supports pluggable allocation strategies, + * we will not reap the benefits of the pool allocator for + * avoiding device-wide synchronizations from cudaMalloc/cudaFree + */ + bfKnn(&gpu_res, args); + } } CUDA_CHECK(cudaPeekAtLastError()); From 05b6edeed9b418dbfa8572d1c7024f11fcd6955a Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 21 Sep 2021 12:54:31 -0400 Subject: [PATCH 020/171] Make sure we keep the rapids-cmake and raft cal version in sync (#331) When we make a new raft version, we need to also bump the rapids-cmake version at the same time. Otherwise we will get the previous releases dependencies by mistake. 
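For example, when the RAFT CalVer moves from 21.10.00 to 21.12.00 (as in the DOC update patch above), the `branch-21.10/RAPIDS.cmake` reference in `cpp/CMakeLists.txt` must become `branch-21.12/RAPIDS.cmake`; the `sed_runner` line added below rewrites exactly that branch component using `${NEXT_SHORT_TAG}`.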
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/331 --- ci/release/update-version.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index fcdb6a2233..45383aaf68 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -30,4 +30,5 @@ function sed_runner() { sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak } -sed_runner 's/'"RAFT VERSION .* LANGUAGES"'/'"RAFT VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt \ No newline at end of file +sed_runner 's/'"RAFT VERSION .* LANGUAGES"'/'"RAFT VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt +sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' cpp/CMakeLists.txt \ No newline at end of file From 2ecb227e16443775ade23195a82ee9797ce8ede4 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 23 Sep 2021 16:47:27 -0500 Subject: [PATCH 021/171] Pin max `dask` and `distributed` versions to `2021.09.1` (#334) Changes to be in-line with: https://github.com/rapidsai/cudf/pull/9286/ Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/334 --- ci/gpu/build.sh | 4 ++-- ci/local/old-gpubuild.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 0ba9901107..6f5b5c3c6c 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -59,8 +59,8 @@ gpuci_mamba_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid # Install the master version of dask, distributed, and dask-ml gpuci_logger "Install the master version of dask and distributed" set -x -pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps -pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps +pip install "git+https://github.com/dask/distributed.git@2021.09.1" --upgrade --no-deps +pip install "git+https://github.com/dask/dask.git@2021.09.1" --upgrade --no-deps set +x diff --git a/ci/local/old-gpubuild.sh b/ci/local/old-gpubuild.sh index efd6c0382a..9ab4486977 100644 --- a/ci/local/old-gpubuild.sh +++ b/ci/local/old-gpubuild.sh @@ -81,8 +81,8 @@ fi # Install the master version of dask, distributed, and dask-ml set -x -pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps -pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps +pip install "git+https://github.com/dask/distributed.git@2021.09.1" --upgrade --no-deps +pip install "git+https://github.com/dask/dask.git@2021.09.1" --upgrade --no-deps set +x From 9b4adb1e3f5e3d76db9cb3b1dc971c63617b7da1 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 23 Sep 2021 18:46:08 -0400 Subject: [PATCH 022/171] Random Ball Cover Algorithm for 2D Haversine/Euclidean (#213) This PR is a proof of concept to use the triangle inequality to prune the tree of exhaustive distance computations into something smaller, such as on the order of where c is called an expansion constant, based on the dimensionality. 
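To make the pruning rule concrete (a sketch in general metric-space terms, not specific to this implementation): for a query point q, a landmark r, and an indexed point x assigned to r's ball, the triangle inequality gives dist(q, x) >= dist(q, r) - dist(r, x) >= dist(q, r) - radius(r). Once k candidates have been collected, any ball whose lower bound dist(q, r) - radius(r) already exceeds the current k-th smallest distance can be discarded without computing a single point-to-point distance.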
This should (hopefully) be able to benefit both sparse and dense k-nearest neighbors and all algorithms that use them, hopefully providing a significant speedup for our sparse semirings primitive when only the k-nearest neighbors are desired. The goal here is to construct a tree out of the random ball cover algorithm such that we can utilize it in algorithms which would otherwise be able to make efficient use of a ball tree. However, there are additional challenges to this algorithm on the GPU, such as being able to batch the tree lookups. Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - William Hicks (https://github.com/wphicks) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/213 --- cpp/include/raft/cache/cache_util.cuh | 6 +- cpp/include/raft/sparse/linalg/degree.cuh | 10 +- cpp/include/raft/sparse/selection/knn.cuh | 7 +- .../raft/sparse/selection/knn_graph.cuh | 3 +- cpp/include/raft/spatial/knn/ball_cover.hpp | 162 ++++ .../raft/spatial/knn/ball_cover_common.h | 99 +++ .../knn/detail/ann_quantized_faiss.cuh | 2 + .../raft/spatial/knn/detail/ball_cover.cuh | 351 +++++++++ .../spatial/knn/detail/ball_cover/common.cuh | 91 +++ .../knn/detail/ball_cover/registers.cuh | 537 +++++++++++++ .../spatial/knn/detail/block_select_faiss.cuh | 224 ++++++ .../knn/detail/knn_brute_force_faiss.cuh | 37 +- .../knn/detail/selection_faiss.cuh} | 19 +- .../spatial/knn/detail/warp_select_faiss.cuh | 739 ++++++++++++++++++ cpp/include/raft/spatial/knn/knn.hpp | 56 +- cpp/test/CMakeLists.txt | 3 +- cpp/test/spatial/ball_cover.cu | 271 +++++++ cpp/test/spatial/knn.cu | 9 - cpp/test/{sparse => spatial}/selection.cu | 11 +- cpp/test/spatial/spatial_data.h | 27 + cpp/test/test_utils.h | 42 + 21 files changed, 2645 insertions(+), 61 deletions(-) create mode 100644 cpp/include/raft/spatial/knn/ball_cover.hpp create mode 100644 cpp/include/raft/spatial/knn/ball_cover_common.h create mode 100644 cpp/include/raft/spatial/knn/detail/ball_cover.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh rename cpp/include/raft/{sparse/selection/selection.cuh => spatial/knn/detail/selection_faiss.cuh} (94%) create mode 100644 cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh create mode 100644 cpp/test/spatial/ball_cover.cu rename cpp/test/{sparse => spatial}/selection.cu (93%) create mode 100644 cpp/test/spatial/spatial_data.h diff --git a/cpp/include/raft/cache/cache_util.cuh b/cpp/include/raft/cache/cache_util.cuh index ce8ef9a095..a65227c402 100644 --- a/cpp/include/raft/cache/cache_util.cuh +++ b/cpp/include/raft/cache/cache_util.cuh @@ -41,9 +41,9 @@ namespace cache { * @param [in] n the number of elements that need to be collected * @param [out] out vectors collected from the cache, size [n_vec * n] */ -template -__global__ void get_vecs(const math_t *cache, int n_vec, const int *cache_idx, - int n, math_t *out) { +template +__global__ void get_vecs(const math_t *cache, int_t n_vec, + const idx_t *cache_idx, int_t n, math_t *out) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int row = tid % n_vec; // row idx if (tid < n_vec * n) { diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh index 9bd322c90a..ef6a067c39 100644 --- 
a/cpp/include/raft/sparse/linalg/degree.cuh +++ b/cpp/include/raft/sparse/linalg/degree.cuh @@ -43,11 +43,11 @@ namespace linalg { * @param nnz the size of the rows array * @param results array to place results */ -template -__global__ void coo_degree_kernel(const int *rows, int nnz, int *results) { +template +__global__ void coo_degree_kernel(const T *rows, int nnz, T *results) { int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < nnz) { - raft::myAtomicAdd(results + rows[row], 1); + atomicAdd(results + rows[row], (T)1); } } @@ -59,8 +59,8 @@ __global__ void coo_degree_kernel(const int *rows, int nnz, int *results) { * @param results: output result array * @param stream: cuda stream to use */ -template -void coo_degree(const int *rows, int nnz, int *results, cudaStream_t stream) { +template +void coo_degree(const T *rows, int nnz, T *results, cudaStream_t stream) { dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index 3566939bc4..49573a679d 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -31,8 +31,6 @@ #include #include #include -#include - #include #include @@ -339,8 +337,9 @@ class sparse_knn_t { if (metric == raft::distance::DistanceType::InnerProduct) ascending = false; // kernel to slice first (min) k cols and copy into batched merge buffer - select_k(batch_dists, batch_indices, batch_rows, batch_cols, out_dists, - out_indices, ascending, n_neighbors, handle.get_stream()); + raft::spatial::knn::select_k(batch_dists, batch_indices, batch_rows, + batch_cols, out_dists, out_indices, ascending, + n_neighbors, handle.get_stream()); } void compute_distances(csr_batcher_t &idx_batcher, diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh index 1cdd66f516..3df1c77081 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/knn_graph.cuh @@ -88,11 +88,12 @@ void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) { * @param[in] n number of observations (columns) in X * @param[in] metric distance metric to use when constructing neighborhoods * @param[out] out output edge list + * @param[out] out output edge list * @param c */ template void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, - distance::DistanceType metric, + raft::distance::DistanceType metric, raft::sparse::COO &out, int c = 15) { int k = build_k(m, c); diff --git a/cpp/include/raft/spatial/knn/ball_cover.hpp b/cpp/include/raft/spatial/knn/ball_cover.hpp new file mode 100644 index 0000000000..e4b50c77e3 --- /dev/null +++ b/cpp/include/raft/spatial/knn/ball_cover.hpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include +#include "ball_cover_common.h" +#include "detail/ball_cover.cuh" +#include "detail/ball_cover/common.cuh" + +namespace raft { +namespace spatial { +namespace knn { + +template +void rbc_build_index(const raft::handle_t &handle, + BallCoverIndex &index) { + ASSERT(index.n == 2, + "Random ball cover currently only works in 2-dimensions"); + if (index.metric == raft::distance::DistanceType::Haversine) { + detail::rbc_build_index(handle, index, detail::HaversineFunc()); + } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || + index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) { + detail::rbc_build_index(handle, index, detail::EuclideanFunc()); + } else { + RAFT_FAIL("Metric not support"); + } + + index.set_index_trained(); +} + +/** + * Performs a faster exact knn in metric spaces using the triangle + * inequality with a number of landmark points to reduce the + * number of distance computations from O(n^2) to O(sqrt(n)). This + * performs an all neighbors knn, which can reuse memory when + * the index and query are the same array. This function will + * build the index and assumes rbc_build_index() has not already + * been called. + * @tparam value_idx knn index type + * @tparam value_t knn distance type + * @tparam value_int type for integers, such as number of rows/cols + * @param handle raft handle for resource management + * @param index ball cover index which has not yet been built + * @param k number of nearest neighbors to find + * @param perform_post_filtering if this is false, only the closest k landmarks + * are considered (which will return approximate + * results). + * @param[out] inds output knn indices + * @param[out] dists output knn distances + * @param weight a weight for overlap between the closest landmark and + * the radius of other landmarks when pruning distances. + * Setting this value below 1 can effectively turn off + * computing distances against many other balls, enabling + * approximate nearest neighbors. Recall can be adjusted + * based on how many relevant balls are ignored. Note that + * many datasets can still have great recall even by only + * looking in the closest landmark. + */ +template +void rbc_all_knn_query(const raft::handle_t &handle, + BallCoverIndex &index, + value_int k, value_idx *inds, value_t *dists, + bool perform_post_filtering = true, float weight = 1.0) { + ASSERT(index.n == 2, + "Random ball cover currently only works in 2-dimensions"); + if (index.metric == raft::distance::DistanceType::Haversine) { + detail::rbc_all_knn_query(handle, index, k, inds, dists, + detail::HaversineFunc(), perform_post_filtering, + weight); + } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || + index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) { + detail::rbc_all_knn_query(handle, index, k, inds, dists, + detail::EuclideanFunc(), perform_post_filtering, + weight); + } else { + RAFT_FAIL("Metric not supported"); + } + + index.set_index_trained(); +} + +/** + * Performs a faster exact knn in metric spaces using the triangle + * inequality with a number of landmark points to reduce the + * number of distance computations from O(n^2) to O(sqrt(n)). This + * function does not build the index and assumes rbc_build_index() has + * already been called. Use this function when the index and + * query arrays are different, otherwise use rbc_all_knn_query(). 
+ * @tparam value_idx index type + * @tparam value_t distances type + * @tparam value_int integer type for size info + * @param handle raft handle for resource management + * @param index ball cover index which has not yet been built + * @param k number of nearest neighbors to find + * @param query the + * @param perform_post_filtering if this is false, only the closest k landmarks + * are considered (which will return approximate + * results). + * @param[out] inds output knn indices + * @param[out] dists output knn distances + * @param weight a weight for overlap between the closest landmark and + * the radius of other landmarks when pruning distances. + * Setting this value below 1 can effectively turn off + * computing distances against many other balls, enabling + * approximate nearest neighbors. Recall can be adjusted + * based on how many relevant balls are ignored. Note that + * many datasets can still have great recall even by only + * looking in the closest landmark. + * @param k + * @param inds + * @param dists + * @param n_samples + */ +template +void rbc_knn_query(const raft::handle_t &handle, + BallCoverIndex &index, + value_int k, const value_t *query, value_int n_query_pts, + value_idx *inds, value_t *dists, + bool perform_post_filtering = true, float weight = 1.0) { + ASSERT(index.n == 2, + "Random ball cover currently only works in 2-dimensions"); + if (index.metric == raft::distance::DistanceType::Haversine) { + detail::rbc_knn_query(handle, index, k, query, n_query_pts, inds, dists, + detail::HaversineFunc(), perform_post_filtering, + weight); + } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || + index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) { + detail::rbc_knn_query(handle, index, k, query, n_query_pts, inds, dists, + detail::EuclideanFunc(), perform_post_filtering, + weight); + } else { + RAFT_FAIL("Metric not supported"); + } +} + +// TODO: implement functions for: +// 4. rbc_eps_neigh() - given a populated index, perform query against different query array +// 5. rbc_all_eps_neigh() - populate a BallCoverIndex and query against training data + +} // namespace knn +} // namespace spatial +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/ball_cover_common.h b/cpp/include/raft/spatial/knn/ball_cover_common.h new file mode 100644 index 0000000000..ca614bb0cb --- /dev/null +++ b/cpp/include/raft/spatial/knn/ball_cover_common.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace spatial { +namespace knn { + +/** + * Stores raw index data points, sampled landmarks, the 1-nns of index points + * to their closest landmarks, and the ball radii of each landmark. This + * class is intended to be constructed once and reused across subsequent + * queries. 
+ * @tparam value_idx + * @tparam value_t + * @tparam value_int + */ +template +class BallCoverIndex { + public: + explicit BallCoverIndex(const raft::handle_t &handle_, const value_t *X_, + value_int m_, value_int n_, + raft::distance::DistanceType metric_) + : handle(handle_), + X(X_), + m(m_), + n(n_), + metric(metric_), + /** + * the sqrt() here makes the sqrt(m)^2 a linear-time lower bound + * + * Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m) + */ + n_landmarks(sqrt(m_)), + R_indptr(sqrt(m_) + 1, handle.get_stream()), + R_1nn_cols(m_, handle.get_stream()), + R_1nn_dists(m_, handle.get_stream()), + R(sqrt(m_) * n_, handle.get_stream()), + R_radius(sqrt(m_), handle.get_stream()), + index_trained(false) {} + + value_idx *get_R_indptr() { return R_indptr.data(); } + value_idx *get_R_1nn_cols() { return R_1nn_cols.data(); } + value_t *get_R_1nn_dists() { return R_1nn_dists.data(); } + value_t *get_R_radius() { return R_radius.data(); } + value_t *get_R() { return R.data(); } + const value_t *get_X() { return X; } + + bool is_index_trained() const { return index_trained; }; + + // This should only be set by internal functions + void set_index_trained() { index_trained = true; } + + const raft::handle_t &handle; + + const value_int m; + const value_int n; + const value_int n_landmarks; + + const value_t *X; + + raft::distance::DistanceType metric; + + private: + // CSR storing the neighborhoods for each data point + rmm::device_uvector R_indptr; + rmm::device_uvector R_1nn_cols; + rmm::device_uvector R_1nn_dists; + + rmm::device_uvector R_radius; + + rmm::device_uvector R; + + protected: + bool index_trained; +}; +} // namespace knn +} // namespace spatial +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 77ad4afe96..0e91b5225d 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -17,12 +17,14 @@ #pragma once #include "../ann_common.h" +#include "knn_brute_force_faiss.cuh" #include "common_faiss.h" #include "processing.hpp" #include #include +#include "processing.hpp" #include